#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for SPARCv9, vanilla, as well
# as VIS3 and FMA extensions.
#
# May, August 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#			IALU(*)		FMA
#
# UltraSPARC III	12.3(**)
# SPARC T3		7.92
# SPARC T4		1.70(***)	6.55
# SPARC64 X		5.60		3.64
#
# (*)	Comparison to compiler-generated code is really problematic,
#	because latter's performance varies too much depending on too
#	many variables. For example, one can measure from 5x to 15x
#	improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
#	unfair comparison, because compiler doesn't use VIS3, but
#	given same initial conditions coefficient varies from 3x to 9x.
# (**)	Pre-III performance should be even worse; floating-point
#	performance for UltraSPARC I-IV on the other hand is reported
#	to be 4.25 for hand-coded assembly, but they are just too old
#	to care about.
# (***)	Multi-process benchmark saturates at ~12.5x single-process
#	result on 8-core processor, or ~21GBps per 2.85GHz socket.
# Output goes to the file named by the last command-line argument
# (perlasm convention: the build driver passes the destination .S path last).
# NOTE(fix): the original contained a second, stray "my $output = pop;"
# followed by "open STDOUT,'>$stdout';" — $stdout was never declared, so
# STDOUT was re-opened onto an empty filename and a bogus extra argument
# was popped. The pop/open pair must happen exactly once, here.
my $output = pop;
open STDOUT,">$output" or die "can't open $output: $!";

# Register aliases for the integer code paths: incoming arguments live in
# %i0-%i5, working state in locals (%l) and outs (%o), scratch in globals (%g).
my ($ctx,$inp,$len,$padbit,$shl,$shr)	= map("%i$_",(0..5));
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)	= map("%l$_",(0..7));
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)	= map("%o$_",(0..5,7));
my ($d0,$d1,$d2,$d3)			= map("%g$_",(1..4));

$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
# define	STPTR	stx
# define	SIZE_T	8
#else
# define	STPTR	st
# define	SIZE_T	4
#endif
#define	LOCALS	(STACK_BIAS+STACK_FRAME)

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	poly1305_init
.align	32
poly1305_init:
	save	%sp,-STACK_FRAME-16,%sp
	nop

	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
	ld	[%g1],%g1

	and	%g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
	cmp	%g1,SPARCV9_FMADD
	be	.Lpoly1305_init_fma
	nop

	stx	%g0,[$ctx+0]
	stx	%g0,[$ctx+8]		! zero hash value
	brz,pn	$inp,.Lno_key
	stx	%g0,[$ctx+16]

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	sll	$shr,3,$shr		! *8
	neg	$shr,$shl

	sethi	%hi(0x0ffffffc),$t0
	set	8,$h1
	or	$t0,%lo(0x0ffffffc),$t0
	set	16,$h2
	sllx	$t0,32,$t1
	or	$t0,$t1,$t1		! 0x0ffffffc0ffffffc
	or	$t1,3,$t0		! 0x0ffffffc0fffffff

	ldxa	[$inp+%g0]0x88,$h0	! load little-endian key
	brz,pt	$shr,.Lkey_aligned
	ldxa	[$inp+$h1]0x88,$h1

	ldxa	[$inp+$h2]0x88,$h2
	srlx	$h0,$shr,$h0
	sllx	$h1,$shl,$t2
	srlx	$h1,$shr,$h1
	or	$t2,$h0,$h0
	sllx	$h2,$shl,$h2
	or	$h2,$h1,$h1

.Lkey_aligned:
	and	$t0,$h0,$h0
	and	$t1,$h1,$h1
	stx	$h0,[$ctx+32+0]		! store key
	stx	$h1,[$ctx+32+8]

	andcc	%g1,SPARCV9_VIS3,%g0
	be	.Lno_key
	nop

1:	call	.+8
	add	%o7,poly1305_blocks_vis3-1b,%o7

	add	%o7,poly1305_emit-poly1305_blocks_vis3,%o5
	STPTR	%o7,[%i2]
	STPTR	%o5,[%i2+SIZE_T]

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init,#function
.size	poly1305_init,.-poly1305_init

.globl	poly1305_blocks
.align	32
poly1305_blocks:
	save	%sp,-STACK_FRAME,%sp
	srln	$len,4,$len

	brz,pn	$len,.Lno_data
	nop

	ld	[$ctx+32+0],$r1		! load key
	ld	[$ctx+32+4],$r0
	ld	[$ctx+32+8],$r3
	ld	[$ctx+32+12],$r2

	ld	[$ctx+0],$h1		! load hash value
	ld	[$ctx+4],$h0
	ld	[$ctx+8],$h3
	ld	[$ctx+12],$h2
	ld	[$ctx+16],$h4

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	set	8,$d1
	sll	$shr,3,$shr		! *8
	set	16,$d2
	neg	$shr,$shl

	srl	$r1,2,$s1
	srl	$r2,2,$s2
	add	$r1,$s1,$s1
	srl	$r3,2,$s3
	add	$r2,$s2,$s2
	add	$r3,$s3,$s3

.Loop:
	ldxa	[$inp+%g0]0x88,$d0	! load little-endian input
	brz,pt	$shr,.Linp_aligned
	ldxa	[$inp+$d1]0x88,$d1

	ldxa	[$inp+$d2]0x88,$d2
	srlx	$d0,$shr,$d0
	sllx	$d1,$shl,$t1
	srlx	$d1,$shr,$d1
	or	$t1,$d0,$d0
	sllx	$d2,$shl,$d2
	or	$d2,$d1,$d1

.Linp_aligned:
	srlx	$d0,32,$t0
	addcc	$d0,$h0,$h0		! accumulate input
	srlx	$d1,32,$t1
	addccc	$t0,$h1,$h1
	addccc	$d1,$h2,$h2
	addccc	$t1,$h3,$h3
	addc	$padbit,$h4,$h4

	umul	$r0,$h0,$d0
	umul	$r1,$h0,$d1
	umul	$r2,$h0,$d2
	umul	$r3,$h0,$d3
	sub	$len,1,$len
	add	$inp,16,$inp

	umul	$s3,$h1,$t0
	umul	$r0,$h1,$t1
	umul	$r1,$h1,$t2
	add	$t0,$d0,$d0
	add	$t1,$d1,$d1
	umul	$r2,$h1,$t0
	add	$t2,$d2,$d2
	add	$t0,$d3,$d3

	umul	$s2,$h2,$t1
	umul	$s3,$h2,$t2
	umul	$r0,$h2,$t0
	add	$t1,$d0,$d0
	add	$t2,$d1,$d1
	umul	$r1,$h2,$t1
	add	$t0,$d2,$d2
	add	$t1,$d3,$d3

	umul	$s1,$h3,$t2
	umul	$s2,$h3,$t0
	umul	$s3,$h3,$t1
	add	$t2,$d0,$d0
	add	$t0,$d1,$d1
	umul	$r0,$h3,$t2
	add	$t1,$d2,$d2
	add	$t2,$d3,$d3

	umul	$s1,$h4,$t0
	umul	$s2,$h4,$t1
	umul	$s3,$h4,$t2
	umul	$r0,$h4,$h4
	add	$t0,$d1,$d1
	add	$t1,$d2,$d2
	srlx	$d0,32,$h1
	add	$t2,$d3,$d3
	srlx	$d1,32,$h2

	addcc	$d1,$h1,$h1
	srlx	$d2,32,$h3
	set	8,$d1
	addccc	$d2,$h2,$h2
	srlx	$d3,32,$t0
	set	16,$d2
	addccc	$d3,$h3,$h3
	addc	$t0,$h4,$h4

	srl	$h4,2,$t0		! final reduction step
	andn	$h4,3,$t1
	and	$h4,3,$h4
	add	$t1,$t0,$t0

	addcc	$t0,$d0,$h0
	addccc	%g0,$h1,$h1
	addccc	%g0,$h2,$h2
	addccc	%g0,$h3,$h3
	brnz,pt	$len,.Loop
	addc	%g0,$h4,$h4

	st	$h1,[$ctx+0]		! store hash value
	st	$h0,[$ctx+4]
	st	$h3,[$ctx+8]
	st	$h2,[$ctx+12]
	st	$h4,[$ctx+16]

.Lno_data:
	ret
	restore
.type	poly1305_blocks,#function
.size	poly1305_blocks,.-poly1305_blocks
___
########################################################################
# VIS3 has umulxhi and addxc...
{
# VIS3 code path: processes the 130-bit state as two 64-bit limbs plus a
# 2-bit top limb, using mulx/umulxhi for 64x64->128 multiplication and
# addxc/addxccc for 64-bit carry chains.
my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));

$code.=<<___;
.align	32
poly1305_blocks_vis3:
	save	%sp,-STACK_FRAME,%sp
	srln	$len,4,$len

	brz,pn	$len,.Lno_data
	nop

	ldx	[$ctx+32+0],$R0		! load key
	ldx	[$ctx+32+8],$R1

	ldx	[$ctx+0],$H0		! load hash value
	ldx	[$ctx+8],$H1
	ld	[$ctx+16],$H2

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	set	8,$r1
	sll	$shr,3,$shr		! *8
	set	16,$r2
	neg	$shr,$shl

	srlx	$R1,2,$S1
	b	.Loop_vis3
	add	$R1,$S1,$S1

.Loop_vis3:
	ldxa	[$inp+%g0]0x88,$D0	! load little-endian input
	brz,pt	$shr,.Linp_aligned_vis3
	ldxa	[$inp+$r1]0x88,$D1

	ldxa	[$inp+$r2]0x88,$D2
	srlx	$D0,$shr,$D0
	sllx	$D1,$shl,$T1
	srlx	$D1,$shr,$D1
	or	$T1,$D0,$D0
	sllx	$D2,$shl,$D2
	or	$D2,$D1,$D1

.Linp_aligned_vis3:
	addcc	$D0,$H0,$H0		! accumulate input
	sub	$len,1,$len
	addxccc	$D1,$H1,$H1
	add	$inp,16,$inp

	mulx	$R0,$H0,$D0		! r0*h0
	addxc	$padbit,$H2,$H2
	umulxhi	$R0,$H0,$D1
	mulx	$S1,$H1,$T0		! s1*h1
	umulxhi	$S1,$H1,$T1
	addcc	$T0,$D0,$D0
	mulx	$R1,$H0,$T0		! r1*h0
	addxc	$T1,$D1,$D1
	umulxhi	$R1,$H0,$D2
	addcc	$T0,$D1,$D1
	mulx	$R0,$H1,$T0		! r0*h1
	addxc	%g0,$D2,$D2
	umulxhi	$R0,$H1,$T1
	addcc	$T0,$D1,$D1
	mulx	$S1,$H2,$T0		! s1*h2
	addxc	$T1,$D2,$D2
	mulx	$R0,$H2,$T1		! r0*h2
	addcc	$T0,$D1,$D1
	addxc	$T1,$D2,$D2

	srlx	$D2,2,$T0		! final reduction step
	andn	$D2,3,$T1
	and	$D2,3,$H2
	add	$T1,$T0,$T0

	addcc	$T0,$D0,$H0
	addxccc	%g0,$D1,$H1
	brnz,pt	$len,.Loop_vis3
	addxc	%g0,$H2,$H2

	stx	$H0,[$ctx+0]		! store hash value
	stx	$H1,[$ctx+8]
	st	$H2,[$ctx+16]

	ret
	restore
.type	poly1305_blocks_vis3,#function
.size	poly1305_blocks_vis3,.-poly1305_blocks_vis3
___
}
# poly1305_emit reuses the %i1/%i2 argument registers as mac/nonce pointers.
my ($mac,$nonce) = ($inp,$len);

$code.=<<___;
.globl	poly1305_emit
.align	32
poly1305_emit:
	save	%sp,-STACK_FRAME,%sp

	ld	[$ctx+0],$h1		! load hash value
	ld	[$ctx+4],$h0
	ld	[$ctx+8],$h3
	ld	[$ctx+12],$h2
	ld	[$ctx+16],$h4

	addcc	$h0,5,$r0		! compare to modulus
	addccc	$h1,0,$r1
	addccc	$h2,0,$r2
	addccc	$h3,0,$r3
	addc	$h4,0,$h4
	andcc	$h4,4,%g0		! did it carry/borrow?

	movnz	%icc,$r0,$h0
	ld	[$nonce+0],$r0		! load nonce
	movnz	%icc,$r1,$h1
	ld	[$nonce+4],$r1
	movnz	%icc,$r2,$h2
	ld	[$nonce+8],$r2
	movnz	%icc,$r3,$h3
	ld	[$nonce+12],$r3

	addcc	$r0,$h0,$h0		! accumulate nonce
	addccc	$r1,$h1,$h1
	addccc	$r2,$h2,$h2
	addc	$r3,$h3,$h3

	srl	$h0,8,$r0
	stb	$h0,[$mac+0]		! store little-endian result
	srl	$h0,16,$r1
	stb	$r0,[$mac+1]
	srl	$h0,24,$r2
	stb	$r1,[$mac+2]
	stb	$r2,[$mac+3]

	srl	$h1,8,$r0
	stb	$h1,[$mac+4]
	srl	$h1,16,$r1
	stb	$r0,[$mac+5]
	srl	$h1,24,$r2
	stb	$r1,[$mac+6]
	stb	$r2,[$mac+7]

	srl	$h2,8,$r0
	stb	$h2,[$mac+8]
	srl	$h2,16,$r1
	stb	$r0,[$mac+9]
	srl	$h2,24,$r2
	stb	$r1,[$mac+10]
	stb	$r2,[$mac+11]

	srl	$h3,8,$r0
	stb	$h3,[$mac+12]
	srl	$h3,16,$r1
	stb	$r0,[$mac+13]
	srl	$h3,24,$r2
	stb	$r1,[$mac+14]
	stb	$r2,[$mac+15]

	ret
	restore
.type	poly1305_emit,#function
.size	poly1305_emit,.-poly1305_emit
___

{
# FMA code path: the state and key are kept in double-precision floating
# point, split into four 32-bit limbs per value, each represented as a
# hi/lo pair of doubles (base 2^48 -> base 2^32 conversion between rounds).
my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
my $i2=$step;	# $i2 aliases $step: both are only needed before the loop

my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# borrowings
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);

$code.=<<___;
.align	32
poly1305_init_fma:
	save	%sp,-STACK_FRAME-16,%sp
	nop

.Lpoly1305_init_fma:
1:	call	.+8
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],$two0		! load constants
	ldd	[%o7+8*1],$two32
	ldd	[%o7+8*2],$two64
	ldd	[%o7+8*3],$two96
	ldd	[%o7+8*5],$five_two130

	std	$two0,[$ctx+8*0]	! initial hash value, biased 0
	std	$two32,[$ctx+8*1]
	std	$two64,[$ctx+8*2]
	std	$two96,[$ctx+8*3]

	brz,pn	$inp,.Lno_key_fma
	nop

	stx	%fsr,[%sp+LOCALS]	! save original %fsr
	ldx	[%o7+8*6],%fsr		! load new %fsr

	std	$two0,[$ctx+8*4]	! key "template"
	std	$two32,[$ctx+8*5]
	std	$two64,[$ctx+8*6]
	std	$two96,[$ctx+8*7]

	and	$inp,7,$shr
	andn	$inp,7,$inp		! align pointer
	mov	8,$i1
	sll	$shr,3,$shr
	mov	16,$i2
	neg	$shr,$shl

	ldxa	[$inp+%g0]0x88,$in0	! load little-endian key
	ldxa	[$inp+$i1]0x88,$in2

	brz	$shr,.Lkey_aligned_fma
	sethi	%hi(0xf0000000),$i1	! 0xf0000000

	ldxa	[$inp+$i2]0x88,$in4

	srlx	$in0,$shr,$in0		! align data
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in2
	or	$in1,$in0,$in0
	sllx	$in4,$shl,$in3
	or	$in3,$in2,$in2

.Lkey_aligned_fma:
	or	$i1,3,$i2		! 0xf0000003
	srlx	$in0,32,$in1
	andn	$in0,$i1,$in0		! &=0x0fffffff
	andn	$in1,$i2,$in1		! &=0x0ffffffc
	srlx	$in2,32,$in3
	andn	$in2,$i2,$in2
	andn	$in3,$i2,$in3

	st	$in0,[$ctx+`8*4+4`]	! fill "template"
	st	$in1,[$ctx+`8*5+4`]
	st	$in2,[$ctx+`8*6+4`]
	st	$in3,[$ctx+`8*7+4`]

	ldd	[$ctx+8*4],$h0lo	! load [biased] key
	ldd	[$ctx+8*5],$h1lo
	ldd	[$ctx+8*6],$h2lo
	ldd	[$ctx+8*7],$h3lo

	fsubd	$h0lo,$two0, $h0lo	! r0
	ldd	[%o7+8*7],$two0		! more constants
	fsubd	$h1lo,$two32,$h1lo	! r1
	ldd	[%o7+8*8],$two32
	fsubd	$h2lo,$two64,$h2lo	! r2
	ldd	[%o7+8*9],$two64
	fsubd	$h3lo,$two96,$h3lo	! r3
	ldd	[%o7+8*10],$two96

	fmuld	$five_two130,$h1lo,$s1lo	! s1
	fmuld	$five_two130,$h2lo,$s2lo	! s2
	fmuld	$five_two130,$h3lo,$s3lo	! s3

	faddd	$h0lo,$two0, $h0hi
	faddd	$h1lo,$two32,$h1hi
	faddd	$h2lo,$two64,$h2hi
	faddd	$h3lo,$two96,$h3hi

	fsubd	$h0hi,$two0, $h0hi
	ldd	[%o7+8*11],$two0	! more constants
	fsubd	$h1hi,$two32,$h1hi
	ldd	[%o7+8*12],$two32
	fsubd	$h2hi,$two64,$h2hi
	ldd	[%o7+8*13],$two64
	fsubd	$h3hi,$two96,$h3hi

	fsubd	$h0lo,$h0hi,$h0lo
	std	$h0hi,[$ctx+8*5]	! r0hi
	fsubd	$h1lo,$h1hi,$h1lo
	std	$h1hi,[$ctx+8*7]	! r1hi
	fsubd	$h2lo,$h2hi,$h2lo
	std	$h2hi,[$ctx+8*9]	! r2hi
	fsubd	$h3lo,$h3hi,$h3lo
	std	$h3hi,[$ctx+8*11]	! r3hi

	faddd	$s1lo,$two0, $s1hi
	faddd	$s2lo,$two32,$s2hi
	faddd	$s3lo,$two64,$s3hi

	fsubd	$s1hi,$two0, $s1hi
	fsubd	$s2hi,$two32,$s2hi
	fsubd	$s3hi,$two64,$s3hi

	fsubd	$s1lo,$s1hi,$s1lo
	fsubd	$s2lo,$s2hi,$s2lo
	fsubd	$s3lo,$s3hi,$s3lo

	ldx	[%sp+LOCALS],%fsr	! restore %fsr

	std	$h0lo,[$ctx+8*4]	! r0lo
	std	$h1lo,[$ctx+8*6]	! r1lo
	std	$h2lo,[$ctx+8*8]	! r2lo
	std	$h3lo,[$ctx+8*10]	! r3lo

	std	$s1hi,[$ctx+8*13]
	std	$s2hi,[$ctx+8*15]
	std	$s3hi,[$ctx+8*17]

	std	$s1lo,[$ctx+8*12]
	std	$s2lo,[$ctx+8*14]
	std	$s3lo,[$ctx+8*16]

	add	%o7,poly1305_blocks_fma-.Lconsts_fma,%o0
	add	%o7,poly1305_emit_fma-.Lconsts_fma,%o1
	STPTR	%o0,[%i2]
	STPTR	%o1,[%i2+SIZE_T]

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key_fma:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init_fma,#function
.size	poly1305_init_fma,.-poly1305_init_fma

.align	32
poly1305_blocks_fma:
	save	%sp,-STACK_FRAME-48,%sp
	srln	$len,4,$len

	brz,pn	$len,.Labort
	sub	$len,1,$len

1:	call	.+8
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],$two0		! load constants
	ldd	[%o7+8*1],$two32
	ldd	[%o7+8*2],$two64
	ldd	[%o7+8*3],$two96
	ldd	[%o7+8*4],$two130
	ldd	[%o7+8*5],$five_two130

	ldd	[$ctx+8*0],$h0lo	! load [biased] hash value
	ldd	[$ctx+8*1],$h1lo
	ldd	[$ctx+8*2],$h2lo
	ldd	[$ctx+8*3],$h3lo

	std	$two0,[%sp+LOCALS+8*0]	! input "template"
	sethi	%hi((1023+52+96)<<20),$in3
	std	$two32,[%sp+LOCALS+8*1]
	or	$padbit,$in3,$in3
	std	$two64,[%sp+LOCALS+8*2]
	st	$in3,[%sp+LOCALS+8*3]

	and	$inp,7,$shr
	andn	$inp,7,$inp		! align pointer
	mov	8,$i1
	sll	$shr,3,$shr
	mov	16,$step
	neg	$shr,$shl

	ldxa	[$inp+%g0]0x88,$in0	! load little-endian input
	brz	$shr,.Linp_aligned_fma
	ldxa	[$inp+$i1]0x88,$in2

	ldxa	[$inp+$step]0x88,$in4
	add	$inp,8,$inp

	srlx	$in0,$shr,$in0		! align data
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in2
	or	$in1,$in0,$in0
	sllx	$in4,$shl,$in3
	srlx	$in4,$shr,$in4		! pre-shift
	or	$in3,$in2,$in2

.Linp_aligned_fma:
	srlx	$in0,32,$in1
	movrz	$len,0,$step
	srlx	$in2,32,$in3
	add	$step,$inp,$inp		! conditional advance

	st	$in0,[%sp+LOCALS+8*0+4]	! fill "template"
	st	$in1,[%sp+LOCALS+8*1+4]
	st	$in2,[%sp+LOCALS+8*2+4]
	st	$in3,[%sp+LOCALS+8*3+4]

	ldd	[$ctx+8*4],$r0lo	! load key
	ldd	[$ctx+8*5],$r0hi
	ldd	[$ctx+8*6],$r1lo
	ldd	[$ctx+8*7],$r1hi
	ldd	[$ctx+8*8],$r2lo
	ldd	[$ctx+8*9],$r2hi
	ldd	[$ctx+8*10],$r3lo
	ldd	[$ctx+8*11],$r3hi
	ldd	[$ctx+8*12],$s1lo
	ldd	[$ctx+8*13],$s1hi
	ldd	[$ctx+8*14],$s2lo
	ldd	[$ctx+8*15],$s2hi
	ldd	[$ctx+8*16],$s3lo
	ldd	[$ctx+8*17],$s3hi

	stx	%fsr,[%sp+LOCALS+8*4]	! save original %fsr
	ldx	[%o7+8*6],%fsr		! load new %fsr

	subcc	$len,1,$len
	movrz	$len,0,$step

	ldd	[%sp+LOCALS+8*0],$x0	! load biased input
	ldd	[%sp+LOCALS+8*1],$x1
	ldd	[%sp+LOCALS+8*2],$x2
	ldd	[%sp+LOCALS+8*3],$x3

	fsubd	$h0lo,$two0, $h0lo	! de-bias hash value
	fsubd	$h1lo,$two32,$h1lo
	ldxa	[$inp+%g0]0x88,$in0	! modulo-scheduled input load
	fsubd	$h2lo,$two64,$h2lo
	fsubd	$h3lo,$two96,$h3lo
	ldxa	[$inp+$i1]0x88,$in2

	fsubd	$x0,$two0, $x0		! de-bias input
	fsubd	$x1,$two32,$x1
	fsubd	$x2,$two64,$x2
	fsubd	$x3,$two96,$x3

	brz	$shr,.Linp_aligned_fma2
	add	$step,$inp,$inp		! conditional advance

	sllx	$in0,$shl,$in1		! align data
	srlx	$in0,$shr,$in3
	or	$in1,$in4,$in0
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in4		! pre-shift
	or	$in3,$in1,$in2
.Linp_aligned_fma2:
	srlx	$in0,32,$in1
	srlx	$in2,32,$in3

	faddd	$h0lo,$x0,$x0		! accumulate input
	stw	$in0,[%sp+LOCALS+8*0+4]
	faddd	$h1lo,$x1,$x1
	stw	$in1,[%sp+LOCALS+8*1+4]
	faddd	$h2lo,$x2,$x2
	stw	$in2,[%sp+LOCALS+8*2+4]
	faddd	$h3lo,$x3,$x3
	stw	$in3,[%sp+LOCALS+8*3+4]

	b	.Lentry_fma
	nop

.align	16
.Loop_fma:
	ldxa	[$inp+%g0]0x88,$in0	! modulo-scheduled input load
	ldxa	[$inp+$i1]0x88,$in2
	movrz	$len,0,$step

	faddd	$y0,$h0lo,$h0lo		! accumulate input
	faddd	$y1,$h0hi,$h0hi
	faddd	$y2,$h2lo,$h2lo
	faddd	$y3,$h2hi,$h2hi

	brz,pn	$shr,.Linp_aligned_fma3
	add	$step,$inp,$inp		! conditional advance

	sllx	$in0,$shl,$in1		! align data
	srlx	$in0,$shr,$in3
	or	$in1,$in4,$in0
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in4		! pre-shift
	or	$in3,$in1,$in2

.Linp_aligned_fma3:
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	$two64,$h1lo,$c1lo
	srlx	$in0,32,$in1
	faddd	$two64,$h1hi,$c1hi
	srlx	$in2,32,$in3
	faddd	$two130,$h3lo,$c3lo
	st	$in0,[%sp+LOCALS+8*0+4]	! fill "template"
	faddd	$two130,$h3hi,$c3hi
	st	$in1,[%sp+LOCALS+8*1+4]
	faddd	$two32,$h0lo,$c0lo
	st	$in2,[%sp+LOCALS+8*2+4]
	faddd	$two32,$h0hi,$c0hi
	st	$in3,[%sp+LOCALS+8*3+4]
	faddd	$two96,$h2lo,$c2lo
	faddd	$two96,$h2hi,$c2hi

	fsubd	$c1lo,$two64,$c1lo
	fsubd	$c1hi,$two64,$c1hi
	fsubd	$c3lo,$two130,$c3lo
	fsubd	$c3hi,$two130,$c3hi
	fsubd	$c0lo,$two32,$c0lo
	fsubd	$c0hi,$two32,$c0hi
	fsubd	$c2lo,$two96,$c2lo
	fsubd	$c2hi,$two96,$c2hi

	fsubd	$h1lo,$c1lo,$h1lo
	fsubd	$h1hi,$c1hi,$h1hi
	fsubd	$h3lo,$c3lo,$h3lo
	fsubd	$h3hi,$c3hi,$h3hi
	fsubd	$h2lo,$c2lo,$h2lo
	fsubd	$h2hi,$c2hi,$h2hi
	fsubd	$h0lo,$c0lo,$h0lo
	fsubd	$h0hi,$c0hi,$h0hi

	faddd	$h1lo,$c0lo,$h1lo
	faddd	$h1hi,$c0hi,$h1hi
	faddd	$h3lo,$c2lo,$h3lo
	faddd	$h3hi,$c2hi,$h3hi
	faddd	$h2lo,$c1lo,$h2lo
	faddd	$h2hi,$c1hi,$h2hi
	fmaddd	$five_two130,$c3lo,$h0lo,$h0lo
	fmaddd	$five_two130,$c3hi,$h0hi,$h0hi

	faddd	$h1lo,$h1hi,$x1
	ldd	[$ctx+8*12],$s1lo	! reload constants
	faddd	$h3lo,$h3hi,$x3
	ldd	[$ctx+8*13],$s1hi
	faddd	$h2lo,$h2hi,$x2
	ldd	[$ctx+8*10],$r3lo
	faddd	$h0lo,$h0hi,$x0
	ldd	[$ctx+8*11],$r3hi

.Lentry_fma:
	fmuld	$x1,$s3lo,$h0lo
	fmuld	$x1,$s3hi,$h0hi
	fmuld	$x1,$r1lo,$h2lo
	fmuld	$x1,$r1hi,$h2hi
	fmuld	$x1,$r0lo,$h1lo
	fmuld	$x1,$r0hi,$h1hi
	fmuld	$x1,$r2lo,$h3lo
	fmuld	$x1,$r2hi,$h3hi

	fmaddd	$x3,$s1lo,$h0lo,$h0lo
	fmaddd	$x3,$s1hi,$h0hi,$h0hi
	fmaddd	$x3,$s3lo,$h2lo,$h2lo
	fmaddd	$x3,$s3hi,$h2hi,$h2hi
	fmaddd	$x3,$s2lo,$h1lo,$h1lo
	fmaddd	$x3,$s2hi,$h1hi,$h1hi
	fmaddd	$x3,$r0lo,$h3lo,$h3lo
	fmaddd	$x3,$r0hi,$h3hi,$h3hi

	fmaddd	$x2,$s2lo,$h0lo,$h0lo
	fmaddd	$x2,$s2hi,$h0hi,$h0hi
	fmaddd	$x2,$r0lo,$h2lo,$h2lo
	fmaddd	$x2,$r0hi,$h2hi,$h2hi
	fmaddd	$x2,$s3lo,$h1lo,$h1lo
	ldd	[%sp+LOCALS+8*0],$y0	! load [biased] input
	fmaddd	$x2,$s3hi,$h1hi,$h1hi
	ldd	[%sp+LOCALS+8*1],$y1
	fmaddd	$x2,$r1lo,$h3lo,$h3lo
	ldd	[%sp+LOCALS+8*2],$y2
	fmaddd	$x2,$r1hi,$h3hi,$h3hi
	ldd	[%sp+LOCALS+8*3],$y3

	fmaddd	$x0,$r0lo,$h0lo,$h0lo
	fsubd	$y0,$two0, $y0		! de-bias input
	fmaddd	$x0,$r0hi,$h0hi,$h0hi
	fsubd	$y1,$two32,$y1
	fmaddd	$x0,$r2lo,$h2lo,$h2lo
	fsubd	$y2,$two64,$y2
	fmaddd	$x0,$r2hi,$h2hi,$h2hi
	fsubd	$y3,$two96,$y3
	fmaddd	$x0,$r1lo,$h1lo,$h1lo
	fmaddd	$x0,$r1hi,$h1hi,$h1hi
	fmaddd	$x0,$r3lo,$h3lo,$h3lo
	fmaddd	$x0,$r3hi,$h3hi,$h3hi

	bcc	SIZE_T_CC,.Loop_fma
	subcc	$len,1,$len

	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	$h0lo,$two32,$c0lo
	faddd	$h0hi,$two32,$c0hi
	faddd	$h2lo,$two96,$c2lo
	faddd	$h2hi,$two96,$c2hi
	faddd	$h1lo,$two64,$c1lo
	faddd	$h1hi,$two64,$c1hi
	faddd	$h3lo,$two130,$c3lo
	faddd	$h3hi,$two130,$c3hi

	fsubd	$c0lo,$two32,$c0lo
	fsubd	$c0hi,$two32,$c0hi
	fsubd	$c2lo,$two96,$c2lo
	fsubd	$c2hi,$two96,$c2hi
	fsubd	$c1lo,$two64,$c1lo
	fsubd	$c1hi,$two64,$c1hi
	fsubd	$c3lo,$two130,$c3lo
	fsubd	$c3hi,$two130,$c3hi

	fsubd	$h1lo,$c1lo,$h1lo
	fsubd	$h1hi,$c1hi,$h1hi
	fsubd	$h3lo,$c3lo,$h3lo
	fsubd	$h3hi,$c3hi,$h3hi
	fsubd	$h2lo,$c2lo,$h2lo
	fsubd	$h2hi,$c2hi,$h2hi
	fsubd	$h0lo,$c0lo,$h0lo
	fsubd	$h0hi,$c0hi,$h0hi

	faddd	$h1lo,$c0lo,$h1lo
	faddd	$h1hi,$c0hi,$h1hi
	faddd	$h3lo,$c2lo,$h3lo
	faddd	$h3hi,$c2hi,$h3hi
	faddd	$h2lo,$c1lo,$h2lo
	faddd	$h2hi,$c1hi,$h2hi
	fmaddd	$five_two130,$c3lo,$h0lo,$h0lo
	fmaddd	$five_two130,$c3hi,$h0hi,$h0hi

	faddd	$h1lo,$h1hi,$x1
	faddd	$h3lo,$h3hi,$x3
	faddd	$h2lo,$h2hi,$x2
	faddd	$h0lo,$h0hi,$x0

	faddd	$x1,$two32,$x1		! bias
	faddd	$x3,$two96,$x3
	faddd	$x2,$two64,$x2
	faddd	$x0,$two0, $x0

	ldx	[%sp+LOCALS+8*4],%fsr	! restore saved %fsr

	std	$x1,[$ctx+8*1]		! store [biased] hash value
	std	$x3,[$ctx+8*3]
	std	$x2,[$ctx+8*2]
	std	$x0,[$ctx+8*0]

.Labort:
	ret
	restore
.type	poly1305_blocks_fma,#function
.size	poly1305_blocks_fma,.-poly1305_blocks_fma
___
{
# FMA emit path: unpacks the four biased doubles back into 32-bit limbs,
# finishes the modular reduction in integer registers and adds the nonce.
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
   ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));

$code.=<<___;
.align	32
poly1305_emit_fma:
	save	%sp,-STACK_FRAME,%sp

	ld	[$ctx+8*0+0],$d0	! load hash
	ld	[$ctx+8*0+4],$h0
	ld	[$ctx+8*1+0],$d1
	ld	[$ctx+8*1+4],$h1
	ld	[$ctx+8*2+0],$d2
	ld	[$ctx+8*2+4],$h2
	ld	[$ctx+8*3+0],$d3
	ld	[$ctx+8*3+4],$h3

	sethi	%hi(0xfff00000),$mask
	andn	$d0,$mask,$d0		! mask exponent
	andn	$d1,$mask,$d1
	andn	$d2,$mask,$d2
	andn	$d3,$mask,$d3		! can be partially reduced...
	mov	3,$mask

	srl	$d3,2,$padbit		! ... so reduce
	and	$d3,$mask,$h4
	andn	$d3,$mask,$d3
	add	$padbit,$d3,$d3

	addcc	$d3,$h0,$h0
	addccc	$d0,$h1,$h1
	addccc	$d1,$h2,$h2
	addccc	$d2,$h3,$h3
	addc	%g0,$h4,$h4

	addcc	$h0,5,$d0		! compare to modulus
	addccc	$h1,0,$d1
	addccc	$h2,0,$d2
	addccc	$h3,0,$d3
	addc	$h4,0,$mask

	srl	$mask,2,$mask		! did it carry/borrow?
	neg	$mask,$mask
	sra	$mask,31,$mask		! mask

	andn	$h0,$mask,$h0
	and	$d0,$mask,$d0
	andn	$h1,$mask,$h1
	and	$d1,$mask,$d1
	or	$d0,$h0,$h0
	ld	[$nonce+0],$d0		! load nonce
	andn	$h2,$mask,$h2
	and	$d2,$mask,$d2
	or	$d1,$h1,$h1
	ld	[$nonce+4],$d1
	andn	$h3,$mask,$h3
	and	$d3,$mask,$d3
	or	$d2,$h2,$h2
	ld	[$nonce+8],$d2
	or	$d3,$h3,$h3
	ld	[$nonce+12],$d3

	addcc	$d0,$h0,$h0		! accumulate nonce
	addccc	$d1,$h1,$h1
	addccc	$d2,$h2,$h2
	addc	$d3,$h3,$h3

	stb	$h0,[$mac+0]		! write little-endian result
	srl	$h0,8,$h0
	stb	$h1,[$mac+4]
	srl	$h1,8,$h1
	stb	$h2,[$mac+8]
	srl	$h2,8,$h2
	stb	$h3,[$mac+12]
	srl	$h3,8,$h3

	stb	$h0,[$mac+1]
	srl	$h0,8,$h0
	stb	$h1,[$mac+5]
	srl	$h1,8,$h1
	stb	$h2,[$mac+9]
	srl	$h2,8,$h2
	stb	$h3,[$mac+13]
	srl	$h3,8,$h3

	stb	$h0,[$mac+2]
	srl	$h0,8,$h0
	stb	$h1,[$mac+6]
	srl	$h1,8,$h1
	stb	$h2,[$mac+10]
	srl	$h2,8,$h2
	stb	$h3,[$mac+14]
	srl	$h3,8,$h3

	stb	$h0,[$mac+3]
	stb	$h1,[$mac+7]
	stb	$h2,[$mac+11]
	stb	$h3,[$mac+15]

	ret
	restore
.type	poly1305_emit_fma,#function
.size	poly1305_emit_fma,.-poly1305_emit_fma
___
}

$code.=<<___;
.align	64
.Lconsts_fma:
.word	0x43300000,0x00000000		! 2^(52+0)
.word	0x45300000,0x00000000		! 2^(52+32)
.word	0x47300000,0x00000000		! 2^(52+64)
.word	0x49300000,0x00000000		! 2^(52+96)
.word	0x4b500000,0x00000000		! 2^(52+130)

.word	0x37f40000,0x00000000		! 5/2^130
.word	0,1<<30				! fsr: truncate, no exceptions

.word	0x44300000,0x00000000		! 2^(52+16+0)
.word	0x46300000,0x00000000		! 2^(52+16+32)
.word	0x48300000,0x00000000		! 2^(52+16+64)
.word	0x4a300000,0x00000000		! 2^(52+16+96)
.word	0x3e300000,0x00000000		! 2^(52+16+0-96)
.word	0x40300000,0x00000000		! 2^(52+16+32-96)
.word	0x42300000,0x00000000		! 2^(52+16+64-96)
.asciz	"Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
}

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.

# Emit a VIS3 3-operand instruction (addxc/addxccc/umulxhi) as a raw
# .word, or return the mnemonic untouched if it is not one we encode.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	# Map each %g/%o/%l/%i register name to its 0-31 encoding.
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# Emit an FMA instruction (fmadds/fmaddd/fmsubs/fmsubd) as a raw .word,
# or return the mnemonic untouched if it is not one we encode.
sub unfma {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %fmaopf = (	"fmadds"	=> 0x1,
		"fmaddd"	=> 0x2,
		"fmsubs"	=> 0x5,
		"fmsubd"	=> 0x6	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if ($opf=$fmaopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rs3,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# Post-process the generated assembly: evaluate `...` arithmetic and
# replace VIS3/FMA mnemonics with explicit .word encodings.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge	or
	s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
		&unfma($1,$2,$3,$4,$5)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";