1#! /usr/bin/env perl 2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# ECP_NISTZ256 module for SPARCv9. 18# 19# February 2015. 20# 21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in 22# http://eprint.iacr.org/2013/816. In the process of adaptation 23# original .c module was made 32-bit savvy in order to make this 24# implementation possible. 25# 26# with/without -DECP_NISTZ256_ASM 27# UltraSPARC III +12-18% 28# SPARC T4 +99-550% (+66-150% on 32-bit Solaris) 29# 30# Ranges denote minimum and maximum improvement coefficients depending 31# on benchmark. Lower coefficients are for ECDSA sign, server-side 32# operation. Keep in mind that +200% means 3x improvement. 33 34$output = pop; 35open STDOUT,">$output"; 36 37$code.=<<___; 38#include "sparc_arch.h" 39 40#define LOCALS (STACK_BIAS+STACK_FRAME) 41#ifdef __arch64__ 42.register %g2,#scratch 43.register %g3,#scratch 44# define STACK64_FRAME STACK_FRAME 45# define LOCALS64 LOCALS 46#else 47# define STACK64_FRAME (2047+192) 48# define LOCALS64 STACK64_FRAME 49#endif 50 51.section ".text",#alloc,#execinstr 52___ 53######################################################################## 54# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 55# 56$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 57open TABLE,"<ecp_nistz256_table.c" or 58open TABLE,"<${dir}../ecp_nistz256_table.c" or 59die "failed to open ecp_nistz256_table.c:",$!; 60 61use integer; 62 63foreach(<TABLE>) { 64 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; 65} 66close TABLE; 67 68# See ecp_nistz256_table.c for explanation for why it's 64*16*37. 69# 64*16*37-1 is because $#arr returns last valid index or @arr, not 70# amount of elements. 71die "insane number of elements" if ($#arr != 64*16*37-1); 72 73$code.=<<___; 74.globl ecp_nistz256_precomputed 75.align 4096 76ecp_nistz256_precomputed: 77___ 78######################################################################## 79# this conversion smashes P256_POINT_AFFINE by individual bytes with 80# 64 byte interval, similar to 81# 1111222233334444 82# 1234123412341234 83for(1..37) { 84 @tbl = splice(@arr,0,64*16); 85 for($i=0;$i<64;$i++) { 86 undef @line; 87 for($j=0;$j<64;$j++) { 88 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; 89 } 90 $code.=".byte\t"; 91 $code.=join(',',map { sprintf "0x%02x",$_} @line); 92 $code.="\n"; 93 } 94} 95 96{{{ 97my ($rp,$ap,$bp)=map("%i$_",(0..2)); 98my @acc=map("%l$_",(0..7)); 99my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5"); 100my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1"); 101my ($rp_real,$ap_real)=("%g2","%g3"); 102 103$code.=<<___; 104.type ecp_nistz256_precomputed,#object 105.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed 106.align 64 107.LRR: ! 2^512 mod P precomputed for NIST P256 polynomial 108.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb 109.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004 110.Lone: 111.long 1,0,0,0,0,0,0,0 112.asciz "ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" 113 114! void ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 115.globl ecp_nistz256_to_mont 116.align 64 117ecp_nistz256_to_mont: 118 save %sp,-STACK_FRAME,%sp 119 nop 1201: call .+8 121 add %o7,.LRR-1b,$bp 122 call __ecp_nistz256_mul_mont 123 nop 124 ret 125 restore 126.type ecp_nistz256_to_mont,#function 127.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont 128 129! void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 130.globl ecp_nistz256_from_mont 131.align 32 132ecp_nistz256_from_mont: 133 save %sp,-STACK_FRAME,%sp 134 nop 1351: call .+8 136 add %o7,.Lone-1b,$bp 137 call __ecp_nistz256_mul_mont 138 nop 139 ret 140 restore 141.type ecp_nistz256_from_mont,#function 142.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont 143 144! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8], 145! const BN_ULONG %i2[8]); 146.globl ecp_nistz256_mul_mont 147.align 32 148ecp_nistz256_mul_mont: 149 save %sp,-STACK_FRAME,%sp 150 nop 151 call __ecp_nistz256_mul_mont 152 nop 153 ret 154 restore 155.type ecp_nistz256_mul_mont,#function 156.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 157 158! void ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]); 159.globl ecp_nistz256_sqr_mont 160.align 32 161ecp_nistz256_sqr_mont: 162 save %sp,-STACK_FRAME,%sp 163 mov $ap,$bp 164 call __ecp_nistz256_mul_mont 165 nop 166 ret 167 restore 168.type ecp_nistz256_sqr_mont,#function 169.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 170___ 171 172######################################################################## 173# Special thing to keep in mind is that $t0-$t7 hold 64-bit values, 174# while all others are meant to keep 32. "Meant to" means that additions 175# to @acc[0-7] do "contaminate" upper bits, but they are cleared before 176# they can affect outcome (follow 'and' with $mask). Also keep in mind 177# that addition with carry is addition with 32-bit carry, even though 178# CPU is 64-bit. [Addition with 64-bit carry was introduced in T3, see 179# below for VIS3 code paths.] 180 181$code.=<<___; 182.align 32 183__ecp_nistz256_mul_mont: 184 ld [$bp+0],$bi ! b[0] 185 mov -1,$mask 186 ld [$ap+0],$a0 187 srl $mask,0,$mask ! 0xffffffff 188 ld [$ap+4],$t1 189 ld [$ap+8],$t2 190 ld [$ap+12],$t3 191 ld [$ap+16],$t4 192 ld [$ap+20],$t5 193 ld [$ap+24],$t6 194 ld [$ap+28],$t7 195 mulx $a0,$bi,$t0 ! a[0-7]*b[0], 64-bit results 196 mulx $t1,$bi,$t1 197 mulx $t2,$bi,$t2 198 mulx $t3,$bi,$t3 199 mulx $t4,$bi,$t4 200 mulx $t5,$bi,$t5 201 mulx $t6,$bi,$t6 202 mulx $t7,$bi,$t7 203 srlx $t0,32,@acc[1] ! extract high parts 204 srlx $t1,32,@acc[2] 205 srlx $t2,32,@acc[3] 206 srlx $t3,32,@acc[4] 207 srlx $t4,32,@acc[5] 208 srlx $t5,32,@acc[6] 209 srlx $t6,32,@acc[7] 210 srlx $t7,32,@acc[0] ! "@acc[8]" 211 mov 0,$carry 212___ 213for($i=1;$i<8;$i++) { 214$code.=<<___; 215 addcc @acc[1],$t1,@acc[1] ! accumulate high parts 216 ld [$bp+4*$i],$bi ! b[$i] 217 ld [$ap+4],$t1 ! re-load a[1-7] 218 addccc @acc[2],$t2,@acc[2] 219 addccc @acc[3],$t3,@acc[3] 220 ld [$ap+8],$t2 221 ld [$ap+12],$t3 222 addccc @acc[4],$t4,@acc[4] 223 addccc @acc[5],$t5,@acc[5] 224 ld [$ap+16],$t4 225 ld [$ap+20],$t5 226 addccc @acc[6],$t6,@acc[6] 227 addccc @acc[7],$t7,@acc[7] 228 ld [$ap+24],$t6 229 ld [$ap+28],$t7 230 addccc @acc[0],$carry,@acc[0] ! "@acc[8]" 231 addc %g0,%g0,$carry 232___ 233 # Reduction iteration is normally performed by accumulating 234 # result of multiplication of modulus by "magic" digit [and 235 # omitting least significant word, which is guaranteed to 236 # be 0], but thanks to special form of modulus and "magic" 237 # digit being equal to least significant word, it can be 238 # performed with additions and subtractions alone. Indeed: 239 # 240 # ffff.0001.0000.0000.0000.ffff.ffff.ffff 241 # * abcd 242 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 243 # 244 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 245 # rewrite above as: 246 # 247 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 248 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 249 # - abcd.0000.0000.0000.0000.0000.0000.abcd 250 # 251 # or marking redundant operations: 252 # 253 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- 254 # + abcd.0000.abcd.0000.0000.abcd.----.----.---- 255 # - abcd.----.----.----.----.----.----.---- 256 257$code.=<<___; 258 ! multiplication-less reduction 259 addcc @acc[3],$t0,@acc[3] ! r[3]+=r[0] 260 addccc @acc[4],%g0,@acc[4] ! r[4]+=0 261 and @acc[1],$mask,@acc[1] 262 and @acc[2],$mask,@acc[2] 263 addccc @acc[5],%g0,@acc[5] ! r[5]+=0 264 addccc @acc[6],$t0,@acc[6] ! r[6]+=r[0] 265 and @acc[3],$mask,@acc[3] 266 and @acc[4],$mask,@acc[4] 267 addccc @acc[7],%g0,@acc[7] ! r[7]+=0 268 addccc @acc[0],$t0,@acc[0] ! r[8]+=r[0] "@acc[8]" 269 and @acc[5],$mask,@acc[5] 270 and @acc[6],$mask,@acc[6] 271 addc $carry,%g0,$carry ! top-most carry 272 subcc @acc[7],$t0,@acc[7] ! r[7]-=r[0] 273 subccc @acc[0],%g0,@acc[0] ! r[8]-=0 "@acc[8]" 274 subc $carry,%g0,$carry ! top-most carry 275 and @acc[7],$mask,@acc[7] 276 and @acc[0],$mask,@acc[0] ! "@acc[8]" 277___ 278 push(@acc,shift(@acc)); # rotate registers to "omit" acc[0] 279$code.=<<___; 280 mulx $a0,$bi,$t0 ! a[0-7]*b[$i], 64-bit results 281 mulx $t1,$bi,$t1 282 mulx $t2,$bi,$t2 283 mulx $t3,$bi,$t3 284 mulx $t4,$bi,$t4 285 mulx $t5,$bi,$t5 286 mulx $t6,$bi,$t6 287 mulx $t7,$bi,$t7 288 add @acc[0],$t0,$t0 ! accumulate low parts, can't overflow 289 add @acc[1],$t1,$t1 290 srlx $t0,32,@acc[1] ! extract high parts 291 add @acc[2],$t2,$t2 292 srlx $t1,32,@acc[2] 293 add @acc[3],$t3,$t3 294 srlx $t2,32,@acc[3] 295 add @acc[4],$t4,$t4 296 srlx $t3,32,@acc[4] 297 add @acc[5],$t5,$t5 298 srlx $t4,32,@acc[5] 299 add @acc[6],$t6,$t6 300 srlx $t5,32,@acc[6] 301 add @acc[7],$t7,$t7 302 srlx $t6,32,@acc[7] 303 srlx $t7,32,@acc[0] ! "@acc[8]" 304___ 305} 306$code.=<<___; 307 addcc @acc[1],$t1,@acc[1] ! accumulate high parts 308 addccc @acc[2],$t2,@acc[2] 309 addccc @acc[3],$t3,@acc[3] 310 addccc @acc[4],$t4,@acc[4] 311 addccc @acc[5],$t5,@acc[5] 312 addccc @acc[6],$t6,@acc[6] 313 addccc @acc[7],$t7,@acc[7] 314 addccc @acc[0],$carry,@acc[0] ! "@acc[8]" 315 addc %g0,%g0,$carry 316 317 addcc @acc[3],$t0,@acc[3] ! multiplication-less reduction 318 addccc @acc[4],%g0,@acc[4] 319 addccc @acc[5],%g0,@acc[5] 320 addccc @acc[6],$t0,@acc[6] 321 addccc @acc[7],%g0,@acc[7] 322 addccc @acc[0],$t0,@acc[0] ! "@acc[8]" 323 addc $carry,%g0,$carry 324 subcc @acc[7],$t0,@acc[7] 325 subccc @acc[0],%g0,@acc[0] ! "@acc[8]" 326 subc $carry,%g0,$carry ! top-most carry 327___ 328 push(@acc,shift(@acc)); # rotate registers to omit acc[0] 329$code.=<<___; 330 ! Final step is "if result > mod, subtract mod", but we do it 331 ! "other way around", namely subtract modulus from result 332 ! and if it borrowed, add modulus back. 333 334 subcc @acc[0],-1,@acc[0] ! subtract modulus 335 subccc @acc[1],-1,@acc[1] 336 subccc @acc[2],-1,@acc[2] 337 subccc @acc[3],0,@acc[3] 338 subccc @acc[4],0,@acc[4] 339 subccc @acc[5],0,@acc[5] 340 subccc @acc[6],1,@acc[6] 341 subccc @acc[7],-1,@acc[7] 342 subc $carry,0,$carry ! broadcast borrow bit 343 344 ! Note that because mod has special form, i.e. consists of 345 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by 346 ! using value of broadcasted borrow and the borrow bit itself. 347 ! To minimize dependency chain we first broadcast and then 348 ! extract the bit by negating (follow $bi). 349 350 addcc @acc[0],$carry,@acc[0] ! add modulus or zero 351 addccc @acc[1],$carry,@acc[1] 352 neg $carry,$bi 353 st @acc[0],[$rp] 354 addccc @acc[2],$carry,@acc[2] 355 st @acc[1],[$rp+4] 356 addccc @acc[3],0,@acc[3] 357 st @acc[2],[$rp+8] 358 addccc @acc[4],0,@acc[4] 359 st @acc[3],[$rp+12] 360 addccc @acc[5],0,@acc[5] 361 st @acc[4],[$rp+16] 362 addccc @acc[6],$bi,@acc[6] 363 st @acc[5],[$rp+20] 364 addc @acc[7],$carry,@acc[7] 365 st @acc[6],[$rp+24] 366 retl 367 st @acc[7],[$rp+28] 368.type __ecp_nistz256_mul_mont,#function 369.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 370 371! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8], 372! const BN_ULONG %i2[8]); 373.globl ecp_nistz256_add 374.align 32 375ecp_nistz256_add: 376 save %sp,-STACK_FRAME,%sp 377 ld [$ap],@acc[0] 378 ld [$ap+4],@acc[1] 379 ld [$ap+8],@acc[2] 380 ld [$ap+12],@acc[3] 381 ld [$ap+16],@acc[4] 382 ld [$ap+20],@acc[5] 383 ld [$ap+24],@acc[6] 384 call __ecp_nistz256_add 385 ld [$ap+28],@acc[7] 386 ret 387 restore 388.type ecp_nistz256_add,#function 389.size ecp_nistz256_add,.-ecp_nistz256_add 390 391.align 32 392__ecp_nistz256_add: 393 ld [$bp+0],$t0 ! b[0] 394 ld [$bp+4],$t1 395 ld [$bp+8],$t2 396 ld [$bp+12],$t3 397 addcc @acc[0],$t0,@acc[0] 398 ld [$bp+16],$t4 399 ld [$bp+20],$t5 400 addccc @acc[1],$t1,@acc[1] 401 ld [$bp+24],$t6 402 ld [$bp+28],$t7 403 addccc @acc[2],$t2,@acc[2] 404 addccc @acc[3],$t3,@acc[3] 405 addccc @acc[4],$t4,@acc[4] 406 addccc @acc[5],$t5,@acc[5] 407 addccc @acc[6],$t6,@acc[6] 408 addccc @acc[7],$t7,@acc[7] 409 addc %g0,%g0,$carry 410 411.Lreduce_by_sub: 412 413 ! if a+b >= modulus, subtract modulus. 414 ! 415 ! But since comparison implies subtraction, we subtract 416 ! modulus and then add it back if subtraction borrowed. 417 418 subcc @acc[0],-1,@acc[0] 419 subccc @acc[1],-1,@acc[1] 420 subccc @acc[2],-1,@acc[2] 421 subccc @acc[3], 0,@acc[3] 422 subccc @acc[4], 0,@acc[4] 423 subccc @acc[5], 0,@acc[5] 424 subccc @acc[6], 1,@acc[6] 425 subccc @acc[7],-1,@acc[7] 426 subc $carry,0,$carry 427 428 ! Note that because mod has special form, i.e. consists of 429 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by 430 ! using value of borrow and its negative. 431 432 addcc @acc[0],$carry,@acc[0] ! add synthesized modulus 433 addccc @acc[1],$carry,@acc[1] 434 neg $carry,$bi 435 st @acc[0],[$rp] 436 addccc @acc[2],$carry,@acc[2] 437 st @acc[1],[$rp+4] 438 addccc @acc[3],0,@acc[3] 439 st @acc[2],[$rp+8] 440 addccc @acc[4],0,@acc[4] 441 st @acc[3],[$rp+12] 442 addccc @acc[5],0,@acc[5] 443 st @acc[4],[$rp+16] 444 addccc @acc[6],$bi,@acc[6] 445 st @acc[5],[$rp+20] 446 addc @acc[7],$carry,@acc[7] 447 st @acc[6],[$rp+24] 448 retl 449 st @acc[7],[$rp+28] 450.type __ecp_nistz256_add,#function 451.size __ecp_nistz256_add,.-__ecp_nistz256_add 452 453! void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 454.globl ecp_nistz256_mul_by_2 455.align 32 456ecp_nistz256_mul_by_2: 457 save %sp,-STACK_FRAME,%sp 458 ld [$ap],@acc[0] 459 ld [$ap+4],@acc[1] 460 ld [$ap+8],@acc[2] 461 ld [$ap+12],@acc[3] 462 ld [$ap+16],@acc[4] 463 ld [$ap+20],@acc[5] 464 ld [$ap+24],@acc[6] 465 call __ecp_nistz256_mul_by_2 466 ld [$ap+28],@acc[7] 467 ret 468 restore 469.type ecp_nistz256_mul_by_2,#function 470.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 471 472.align 32 473__ecp_nistz256_mul_by_2: 474 addcc @acc[0],@acc[0],@acc[0] ! a+a=2*a 475 addccc @acc[1],@acc[1],@acc[1] 476 addccc @acc[2],@acc[2],@acc[2] 477 addccc @acc[3],@acc[3],@acc[3] 478 addccc @acc[4],@acc[4],@acc[4] 479 addccc @acc[5],@acc[5],@acc[5] 480 addccc @acc[6],@acc[6],@acc[6] 481 addccc @acc[7],@acc[7],@acc[7] 482 b .Lreduce_by_sub 483 addc %g0,%g0,$carry 484.type __ecp_nistz256_mul_by_2,#function 485.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 486 487! void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 488.globl ecp_nistz256_mul_by_3 489.align 32 490ecp_nistz256_mul_by_3: 491 save %sp,-STACK_FRAME,%sp 492 ld [$ap],@acc[0] 493 ld [$ap+4],@acc[1] 494 ld [$ap+8],@acc[2] 495 ld [$ap+12],@acc[3] 496 ld [$ap+16],@acc[4] 497 ld [$ap+20],@acc[5] 498 ld [$ap+24],@acc[6] 499 call __ecp_nistz256_mul_by_3 500 ld [$ap+28],@acc[7] 501 ret 502 restore 503.type ecp_nistz256_mul_by_3,#function 504.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 505 506.align 32 507__ecp_nistz256_mul_by_3: 508 addcc @acc[0],@acc[0],$t0 ! a+a=2*a 509 addccc @acc[1],@acc[1],$t1 510 addccc @acc[2],@acc[2],$t2 511 addccc @acc[3],@acc[3],$t3 512 addccc @acc[4],@acc[4],$t4 513 addccc @acc[5],@acc[5],$t5 514 addccc @acc[6],@acc[6],$t6 515 addccc @acc[7],@acc[7],$t7 516 addc %g0,%g0,$carry 517 518 subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores 519 subccc $t1,-1,$t1 520 subccc $t2,-1,$t2 521 subccc $t3, 0,$t3 522 subccc $t4, 0,$t4 523 subccc $t5, 0,$t5 524 subccc $t6, 1,$t6 525 subccc $t7,-1,$t7 526 subc $carry,0,$carry 527 528 addcc $t0,$carry,$t0 ! add synthesized modulus 529 addccc $t1,$carry,$t1 530 neg $carry,$bi 531 addccc $t2,$carry,$t2 532 addccc $t3,0,$t3 533 addccc $t4,0,$t4 534 addccc $t5,0,$t5 535 addccc $t6,$bi,$t6 536 addc $t7,$carry,$t7 537 538 addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a 539 addccc $t1,@acc[1],@acc[1] 540 addccc $t2,@acc[2],@acc[2] 541 addccc $t3,@acc[3],@acc[3] 542 addccc $t4,@acc[4],@acc[4] 543 addccc $t5,@acc[5],@acc[5] 544 addccc $t6,@acc[6],@acc[6] 545 addccc $t7,@acc[7],@acc[7] 546 b .Lreduce_by_sub 547 addc %g0,%g0,$carry 548.type __ecp_nistz256_mul_by_3,#function 549.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 550 551! void ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8], 552! const BN_ULONG %i2[8]); 553.globl ecp_nistz256_sub 554.align 32 555ecp_nistz256_sub: 556 save %sp,-STACK_FRAME,%sp 557 ld [$ap],@acc[0] 558 ld [$ap+4],@acc[1] 559 ld [$ap+8],@acc[2] 560 ld [$ap+12],@acc[3] 561 ld [$ap+16],@acc[4] 562 ld [$ap+20],@acc[5] 563 ld [$ap+24],@acc[6] 564 call __ecp_nistz256_sub_from 565 ld [$ap+28],@acc[7] 566 ret 567 restore 568.type ecp_nistz256_sub,#function 569.size ecp_nistz256_sub,.-ecp_nistz256_sub 570 571! void ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 572.globl ecp_nistz256_neg 573.align 32 574ecp_nistz256_neg: 575 save %sp,-STACK_FRAME,%sp 576 mov $ap,$bp 577 mov 0,@acc[0] 578 mov 0,@acc[1] 579 mov 0,@acc[2] 580 mov 0,@acc[3] 581 mov 0,@acc[4] 582 mov 0,@acc[5] 583 mov 0,@acc[6] 584 call __ecp_nistz256_sub_from 585 mov 0,@acc[7] 586 ret 587 restore 588.type ecp_nistz256_neg,#function 589.size ecp_nistz256_neg,.-ecp_nistz256_neg 590 591.align 32 592__ecp_nistz256_sub_from: 593 ld [$bp+0],$t0 ! b[0] 594 ld [$bp+4],$t1 595 ld [$bp+8],$t2 596 ld [$bp+12],$t3 597 subcc @acc[0],$t0,@acc[0] 598 ld [$bp+16],$t4 599 ld [$bp+20],$t5 600 subccc @acc[1],$t1,@acc[1] 601 subccc @acc[2],$t2,@acc[2] 602 ld [$bp+24],$t6 603 ld [$bp+28],$t7 604 subccc @acc[3],$t3,@acc[3] 605 subccc @acc[4],$t4,@acc[4] 606 subccc @acc[5],$t5,@acc[5] 607 subccc @acc[6],$t6,@acc[6] 608 subccc @acc[7],$t7,@acc[7] 609 subc %g0,%g0,$carry ! broadcast borrow bit 610 611.Lreduce_by_add: 612 613 ! if a-b borrows, add modulus. 614 ! 615 ! Note that because mod has special form, i.e. consists of 616 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by 617 ! using value of broadcasted borrow and the borrow bit itself. 618 ! To minimize dependency chain we first broadcast and then 619 ! extract the bit by negating (follow $bi). 620 621 addcc @acc[0],$carry,@acc[0] ! add synthesized modulus 622 addccc @acc[1],$carry,@acc[1] 623 neg $carry,$bi 624 st @acc[0],[$rp] 625 addccc @acc[2],$carry,@acc[2] 626 st @acc[1],[$rp+4] 627 addccc @acc[3],0,@acc[3] 628 st @acc[2],[$rp+8] 629 addccc @acc[4],0,@acc[4] 630 st @acc[3],[$rp+12] 631 addccc @acc[5],0,@acc[5] 632 st @acc[4],[$rp+16] 633 addccc @acc[6],$bi,@acc[6] 634 st @acc[5],[$rp+20] 635 addc @acc[7],$carry,@acc[7] 636 st @acc[6],[$rp+24] 637 retl 638 st @acc[7],[$rp+28] 639.type __ecp_nistz256_sub_from,#function 640.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 641 642.align 32 643__ecp_nistz256_sub_morf: 644 ld [$bp+0],$t0 ! b[0] 645 ld [$bp+4],$t1 646 ld [$bp+8],$t2 647 ld [$bp+12],$t3 648 subcc $t0,@acc[0],@acc[0] 649 ld [$bp+16],$t4 650 ld [$bp+20],$t5 651 subccc $t1,@acc[1],@acc[1] 652 subccc $t2,@acc[2],@acc[2] 653 ld [$bp+24],$t6 654 ld [$bp+28],$t7 655 subccc $t3,@acc[3],@acc[3] 656 subccc $t4,@acc[4],@acc[4] 657 subccc $t5,@acc[5],@acc[5] 658 subccc $t6,@acc[6],@acc[6] 659 subccc $t7,@acc[7],@acc[7] 660 b .Lreduce_by_add 661 subc %g0,%g0,$carry ! broadcast borrow bit 662.type __ecp_nistz256_sub_morf,#function 663.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 664 665! void ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 666.globl ecp_nistz256_div_by_2 667.align 32 668ecp_nistz256_div_by_2: 669 save %sp,-STACK_FRAME,%sp 670 ld [$ap],@acc[0] 671 ld [$ap+4],@acc[1] 672 ld [$ap+8],@acc[2] 673 ld [$ap+12],@acc[3] 674 ld [$ap+16],@acc[4] 675 ld [$ap+20],@acc[5] 676 ld [$ap+24],@acc[6] 677 call __ecp_nistz256_div_by_2 678 ld [$ap+28],@acc[7] 679 ret 680 restore 681.type ecp_nistz256_div_by_2,#function 682.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 683 684.align 32 685__ecp_nistz256_div_by_2: 686 ! ret = (a is odd ? a+mod : a) >> 1 687 688 and @acc[0],1,$bi 689 neg $bi,$carry 690 addcc @acc[0],$carry,@acc[0] 691 addccc @acc[1],$carry,@acc[1] 692 addccc @acc[2],$carry,@acc[2] 693 addccc @acc[3],0,@acc[3] 694 addccc @acc[4],0,@acc[4] 695 addccc @acc[5],0,@acc[5] 696 addccc @acc[6],$bi,@acc[6] 697 addccc @acc[7],$carry,@acc[7] 698 addc %g0,%g0,$carry 699 700 ! ret >>= 1 701 702 srl @acc[0],1,@acc[0] 703 sll @acc[1],31,$t0 704 srl @acc[1],1,@acc[1] 705 or @acc[0],$t0,@acc[0] 706 sll @acc[2],31,$t1 707 srl @acc[2],1,@acc[2] 708 or @acc[1],$t1,@acc[1] 709 sll @acc[3],31,$t2 710 st @acc[0],[$rp] 711 srl @acc[3],1,@acc[3] 712 or @acc[2],$t2,@acc[2] 713 sll @acc[4],31,$t3 714 st @acc[1],[$rp+4] 715 srl @acc[4],1,@acc[4] 716 or @acc[3],$t3,@acc[3] 717 sll @acc[5],31,$t4 718 st @acc[2],[$rp+8] 719 srl @acc[5],1,@acc[5] 720 or @acc[4],$t4,@acc[4] 721 sll @acc[6],31,$t5 722 st @acc[3],[$rp+12] 723 srl @acc[6],1,@acc[6] 724 or @acc[5],$t5,@acc[5] 725 sll @acc[7],31,$t6 726 st @acc[4],[$rp+16] 727 srl @acc[7],1,@acc[7] 728 or @acc[6],$t6,@acc[6] 729 sll $carry,31,$t7 730 st @acc[5],[$rp+20] 731 or @acc[7],$t7,@acc[7] 732 st @acc[6],[$rp+24] 733 retl 734 st @acc[7],[$rp+28] 735.type __ecp_nistz256_div_by_2,#function 736.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 737___ 738 739######################################################################## 740# following subroutines are "literal" implementation of those found in 741# ecp_nistz256.c 742# 743######################################################################## 744# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 745# 746{ 747my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); 748# above map() describes stack layout with 4 temporary 749# 256-bit vectors on top. 750 751$code.=<<___; 752#ifdef __PIC__ 753SPARC_PIC_THUNK(%g1) 754#endif 755 756.globl ecp_nistz256_point_double 757.align 32 758ecp_nistz256_point_double: 759 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) 760 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] 761 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 762 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) 763 be ecp_nistz256_point_double_vis3 764 nop 765 766 save %sp,-STACK_FRAME-32*4,%sp 767 768 mov $rp,$rp_real 769 mov $ap,$ap_real 770 771.Lpoint_double_shortcut: 772 ld [$ap+32],@acc[0] 773 ld [$ap+32+4],@acc[1] 774 ld [$ap+32+8],@acc[2] 775 ld [$ap+32+12],@acc[3] 776 ld [$ap+32+16],@acc[4] 777 ld [$ap+32+20],@acc[5] 778 ld [$ap+32+24],@acc[6] 779 ld [$ap+32+28],@acc[7] 780 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y); 781 add %sp,LOCALS+$S,$rp 782 783 add $ap_real,64,$bp 784 add $ap_real,64,$ap 785 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z); 786 add %sp,LOCALS+$Zsqr,$rp 787 788 add $ap_real,0,$bp 789 call __ecp_nistz256_add ! p256_add(M, Zsqr, in_x); 790 add %sp,LOCALS+$M,$rp 791 792 add %sp,LOCALS+$S,$bp 793 add %sp,LOCALS+$S,$ap 794 call __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S); 795 add %sp,LOCALS+$S,$rp 796 797 ld [$ap_real],@acc[0] 798 add %sp,LOCALS+$Zsqr,$bp 799 ld [$ap_real+4],@acc[1] 800 ld [$ap_real+8],@acc[2] 801 ld [$ap_real+12],@acc[3] 802 ld [$ap_real+16],@acc[4] 803 ld [$ap_real+20],@acc[5] 804 ld [$ap_real+24],@acc[6] 805 ld [$ap_real+28],@acc[7] 806 call __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr); 807 add %sp,LOCALS+$Zsqr,$rp 808 809 add $ap_real,32,$bp 810 add $ap_real,64,$ap 811 call __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y); 812 add %sp,LOCALS+$tmp0,$rp 813 814 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0); 815 add $rp_real,64,$rp 816 817 add %sp,LOCALS+$Zsqr,$bp 818 add %sp,LOCALS+$M,$ap 819 call __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr); 820 add %sp,LOCALS+$M,$rp 821 822 call __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M); 823 add %sp,LOCALS+$M,$rp 824 825 add %sp,LOCALS+$S,$bp 826 add %sp,LOCALS+$S,$ap 827 call __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S); 828 add %sp,LOCALS+$tmp0,$rp 829 830 call __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0); 831 add $rp_real,32,$rp 832 833 add $ap_real,0,$bp 834 add %sp,LOCALS+$S,$ap 835 call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x); 836 add %sp,LOCALS+$S,$rp 837 838 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S); 839 add %sp,LOCALS+$tmp0,$rp 840 841 add %sp,LOCALS+$M,$bp 842 add %sp,LOCALS+$M,$ap 843 call __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M); 844 add $rp_real,0,$rp 845 846 add %sp,LOCALS+$tmp0,$bp 847 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0); 848 add $rp_real,0,$rp 849 850 add %sp,LOCALS+$S,$bp 851 call __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x); 852 add %sp,LOCALS+$S,$rp 853 854 add %sp,LOCALS+$M,$bp 855 add %sp,LOCALS+$S,$ap 856 call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M); 857 add %sp,LOCALS+$S,$rp 858 859 add $rp_real,32,$bp 860 call __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y); 861 add $rp_real,32,$rp 862 863 ret 864 restore 865.type ecp_nistz256_point_double,#function 866.size ecp_nistz256_point_double,.-ecp_nistz256_point_double 867___ 868} 869 870######################################################################## 871# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 872# const P256_POINT *in2); 873{ 874my ($res_x,$res_y,$res_z, 875 $H,$Hsqr,$R,$Rsqr,$Hcub, 876 $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); 877my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 878 879# above map() describes stack layout with 12 temporary 880# 256-bit vectors on top. Then we reserve some space for 881# !in1infty, !in2infty, result of check for zero and return pointer. 882 883my $bp_real=$rp_real; 884 885$code.=<<___; 886.globl ecp_nistz256_point_add 887.align 32 888ecp_nistz256_point_add: 889 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) 890 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] 891 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 892 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) 893 be ecp_nistz256_point_add_vis3 894 nop 895 896 save %sp,-STACK_FRAME-32*12-32,%sp 897 898 stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp 899 mov $ap,$ap_real 900 mov $bp,$bp_real 901 902 ld [$bp+64],$t0 ! in2_z 903 ld [$bp+64+4],$t1 904 ld [$bp+64+8],$t2 905 ld [$bp+64+12],$t3 906 ld [$bp+64+16],$t4 907 ld [$bp+64+20],$t5 908 ld [$bp+64+24],$t6 909 ld [$bp+64+28],$t7 910 or $t1,$t0,$t0 911 or $t3,$t2,$t2 912 or $t5,$t4,$t4 913 or $t7,$t6,$t6 914 or $t2,$t0,$t0 915 or $t6,$t4,$t4 916 or $t4,$t0,$t0 ! !in2infty 917 movrnz $t0,-1,$t0 918 st $t0,[%fp+STACK_BIAS-12] 919 920 ld [$ap+64],$t0 ! in1_z 921 ld [$ap+64+4],$t1 922 ld [$ap+64+8],$t2 923 ld [$ap+64+12],$t3 924 ld [$ap+64+16],$t4 925 ld [$ap+64+20],$t5 926 ld [$ap+64+24],$t6 927 ld [$ap+64+28],$t7 928 or $t1,$t0,$t0 929 or $t3,$t2,$t2 930 or $t5,$t4,$t4 931 or $t7,$t6,$t6 932 or $t2,$t0,$t0 933 or $t6,$t4,$t4 934 or $t4,$t0,$t0 ! !in1infty 935 movrnz $t0,-1,$t0 936 st $t0,[%fp+STACK_BIAS-16] 937 938 add $bp_real,64,$bp 939 add $bp_real,64,$ap 940 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z); 941 add %sp,LOCALS+$Z2sqr,$rp 942 943 add $ap_real,64,$bp 944 add $ap_real,64,$ap 945 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); 946 add %sp,LOCALS+$Z1sqr,$rp 947 948 add $bp_real,64,$bp 949 add %sp,LOCALS+$Z2sqr,$ap 950 call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z); 951 add %sp,LOCALS+$S1,$rp 952 953 add $ap_real,64,$bp 954 add %sp,LOCALS+$Z1sqr,$ap 955 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); 956 add %sp,LOCALS+$S2,$rp 957 958 add $ap_real,32,$bp 959 add %sp,LOCALS+$S1,$ap 960 call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y); 961 add %sp,LOCALS+$S1,$rp 962 963 add $bp_real,32,$bp 964 add %sp,LOCALS+$S2,$ap 965 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); 966 add %sp,LOCALS+$S2,$rp 967 968 add %sp,LOCALS+$S1,$bp 969 call __ecp_nistz256_sub_from ! p256_sub(R, S2, S1); 970 add %sp,LOCALS+$R,$rp 971 972 or @acc[1],@acc[0],@acc[0] ! see if result is zero 973 or @acc[3],@acc[2],@acc[2] 974 or @acc[5],@acc[4],@acc[4] 975 or @acc[7],@acc[6],@acc[6] 976 or @acc[2],@acc[0],@acc[0] 977 or @acc[6],@acc[4],@acc[4] 978 or @acc[4],@acc[0],@acc[0] 979 st @acc[0],[%fp+STACK_BIAS-20] 980 981 add $ap_real,0,$bp 982 add %sp,LOCALS+$Z2sqr,$ap 983 call __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr); 984 add %sp,LOCALS+$U1,$rp 985 986 add $bp_real,0,$bp 987 add %sp,LOCALS+$Z1sqr,$ap 988 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr); 989 add %sp,LOCALS+$U2,$rp 990 991 add %sp,LOCALS+$U1,$bp 992 call __ecp_nistz256_sub_from ! p256_sub(H, U2, U1); 993 add %sp,LOCALS+$H,$rp 994 995 or @acc[1],@acc[0],@acc[0] ! see if result is zero 996 or @acc[3],@acc[2],@acc[2] 997 or @acc[5],@acc[4],@acc[4] 998 or @acc[7],@acc[6],@acc[6] 999 or @acc[2],@acc[0],@acc[0] 1000 or @acc[6],@acc[4],@acc[4] 1001 orcc @acc[4],@acc[0],@acc[0] 1002 1003 bne,pt %icc,.Ladd_proceed ! is_equal(U1,U2)? 1004 nop 1005 1006 ld [%fp+STACK_BIAS-12],$t0 1007 ld [%fp+STACK_BIAS-16],$t1 1008 ld [%fp+STACK_BIAS-20],$t2 1009 andcc $t0,$t1,%g0 1010 be,pt %icc,.Ladd_proceed ! (in1infty || in2infty)? 1011 nop 1012 andcc $t2,$t2,%g0 1013 be,pt %icc,.Ladd_double ! is_equal(S1,S2)? 1014 nop 1015 1016 ldx [%fp+STACK_BIAS-8],$rp 1017 st %g0,[$rp] 1018 st %g0,[$rp+4] 1019 st %g0,[$rp+8] 1020 st %g0,[$rp+12] 1021 st %g0,[$rp+16] 1022 st %g0,[$rp+20] 1023 st %g0,[$rp+24] 1024 st %g0,[$rp+28] 1025 st %g0,[$rp+32] 1026 st %g0,[$rp+32+4] 1027 st %g0,[$rp+32+8] 1028 st %g0,[$rp+32+12] 1029 st %g0,[$rp+32+16] 1030 st %g0,[$rp+32+20] 1031 st %g0,[$rp+32+24] 1032 st %g0,[$rp+32+28] 1033 st %g0,[$rp+64] 1034 st %g0,[$rp+64+4] 1035 st %g0,[$rp+64+8] 1036 st %g0,[$rp+64+12] 1037 st %g0,[$rp+64+16] 1038 st %g0,[$rp+64+20] 1039 st %g0,[$rp+64+24] 1040 st %g0,[$rp+64+28] 1041 b .Ladd_done 1042 nop 1043 1044.align 16 1045.Ladd_double: 1046 ldx [%fp+STACK_BIAS-8],$rp_real 1047 mov $ap_real,$ap 1048 b .Lpoint_double_shortcut 1049 add %sp,32*(12-4)+32,%sp ! difference in frame sizes 1050 1051.align 16 1052.Ladd_proceed: 1053 add %sp,LOCALS+$R,$bp 1054 add %sp,LOCALS+$R,$ap 1055 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); 1056 add %sp,LOCALS+$Rsqr,$rp 1057 1058 add $ap_real,64,$bp 1059 add %sp,LOCALS+$H,$ap 1060 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); 1061 add %sp,LOCALS+$res_z,$rp 1062 1063 add %sp,LOCALS+$H,$bp 1064 add %sp,LOCALS+$H,$ap 1065 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); 1066 add %sp,LOCALS+$Hsqr,$rp 1067 1068 add $bp_real,64,$bp 1069 add %sp,LOCALS+$res_z,$ap 1070 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z); 1071 add %sp,LOCALS+$res_z,$rp 1072 1073 add %sp,LOCALS+$H,$bp 1074 add %sp,LOCALS+$Hsqr,$ap 1075 call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); 1076 add %sp,LOCALS+$Hcub,$rp 1077 1078 add %sp,LOCALS+$U1,$bp 1079 add %sp,LOCALS+$Hsqr,$ap 1080 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr); 1081 add %sp,LOCALS+$U2,$rp 1082 1083 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); 1084 add %sp,LOCALS+$Hsqr,$rp 1085 1086 add %sp,LOCALS+$Rsqr,$bp 1087 call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); 1088 add %sp,LOCALS+$res_x,$rp 1089 1090 add %sp,LOCALS+$Hcub,$bp 1091 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); 1092 add %sp,LOCALS+$res_x,$rp 1093 1094 add %sp,LOCALS+$U2,$bp 1095 call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); 1096 add %sp,LOCALS+$res_y,$rp 1097 1098 add %sp,LOCALS+$Hcub,$bp 1099 add %sp,LOCALS+$S1,$ap 1100 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub); 1101 add %sp,LOCALS+$S2,$rp 1102 1103 add %sp,LOCALS+$R,$bp 1104 add %sp,LOCALS+$res_y,$ap 1105 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); 1106 add %sp,LOCALS+$res_y,$rp 1107 1108 add %sp,LOCALS+$S2,$bp 1109 call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); 1110 add %sp,LOCALS+$res_y,$rp 1111 1112 ld [%fp+STACK_BIAS-16],$t1 ! !in1infty 1113 ld [%fp+STACK_BIAS-12],$t2 ! !in2infty 1114 ldx [%fp+STACK_BIAS-8],$rp 1115___ 1116for($i=0;$i<96;$i+=8) { # conditional moves 1117$code.=<<___; 1118 ld [%sp+LOCALS+$i],@acc[0] ! res 1119 ld [%sp+LOCALS+$i+4],@acc[1] 1120 ld [$bp_real+$i],@acc[2] ! in2 1121 ld [$bp_real+$i+4],@acc[3] 1122 ld [$ap_real+$i],@acc[4] ! in1 1123 ld [$ap_real+$i+4],@acc[5] 1124 movrz $t1,@acc[2],@acc[0] 1125 movrz $t1,@acc[3],@acc[1] 1126 movrz $t2,@acc[4],@acc[0] 1127 movrz $t2,@acc[5],@acc[1] 1128 st @acc[0],[$rp+$i] 1129 st @acc[1],[$rp+$i+4] 1130___ 1131} 1132$code.=<<___; 1133.Ladd_done: 1134 ret 1135 restore 1136.type ecp_nistz256_point_add,#function 1137.size ecp_nistz256_point_add,.-ecp_nistz256_point_add 1138___ 1139} 1140 1141######################################################################## 1142# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, 1143# const P256_POINT_AFFINE *in2); 1144{ 1145my ($res_x,$res_y,$res_z, 1146 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); 1147my $Z1sqr = $S2; 1148# above map() describes stack layout with 10 temporary 1149# 256-bit vectors on top. Then we reserve some space for 1150# !in1infty, !in2infty, result of check for zero and return pointer. 1151 1152my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); 1153my $bp_real=$rp_real; 1154 1155$code.=<<___; 1156.globl ecp_nistz256_point_add_affine 1157.align 32 1158ecp_nistz256_point_add_affine: 1159 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) 1160 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] 1161 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 1162 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) 1163 be ecp_nistz256_point_add_affine_vis3 1164 nop 1165 1166 save %sp,-STACK_FRAME-32*10-32,%sp 1167 1168 stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp 1169 mov $ap,$ap_real 1170 mov $bp,$bp_real 1171 1172 ld [$ap+64],$t0 ! in1_z 1173 ld [$ap+64+4],$t1 1174 ld [$ap+64+8],$t2 1175 ld [$ap+64+12],$t3 1176 ld [$ap+64+16],$t4 1177 ld [$ap+64+20],$t5 1178 ld [$ap+64+24],$t6 1179 ld [$ap+64+28],$t7 1180 or $t1,$t0,$t0 1181 or $t3,$t2,$t2 1182 or $t5,$t4,$t4 1183 or $t7,$t6,$t6 1184 or $t2,$t0,$t0 1185 or $t6,$t4,$t4 1186 or $t4,$t0,$t0 ! !in1infty 1187 movrnz $t0,-1,$t0 1188 st $t0,[%fp+STACK_BIAS-16] 1189 1190 ld [$bp],@acc[0] ! in2_x 1191 ld [$bp+4],@acc[1] 1192 ld [$bp+8],@acc[2] 1193 ld [$bp+12],@acc[3] 1194 ld [$bp+16],@acc[4] 1195 ld [$bp+20],@acc[5] 1196 ld [$bp+24],@acc[6] 1197 ld [$bp+28],@acc[7] 1198 ld [$bp+32],$t0 ! in2_y 1199 ld [$bp+32+4],$t1 1200 ld [$bp+32+8],$t2 1201 ld [$bp+32+12],$t3 1202 ld [$bp+32+16],$t4 1203 ld [$bp+32+20],$t5 1204 ld [$bp+32+24],$t6 1205 ld [$bp+32+28],$t7 1206 or @acc[1],@acc[0],@acc[0] 1207 or @acc[3],@acc[2],@acc[2] 1208 or @acc[5],@acc[4],@acc[4] 1209 or @acc[7],@acc[6],@acc[6] 1210 or @acc[2],@acc[0],@acc[0] 1211 or @acc[6],@acc[4],@acc[4] 1212 or @acc[4],@acc[0],@acc[0] 1213 or $t1,$t0,$t0 1214 or $t3,$t2,$t2 1215 or $t5,$t4,$t4 1216 or $t7,$t6,$t6 1217 or $t2,$t0,$t0 1218 or $t6,$t4,$t4 1219 or $t4,$t0,$t0 1220 or @acc[0],$t0,$t0 ! !in2infty 1221 movrnz $t0,-1,$t0 1222 st $t0,[%fp+STACK_BIAS-12] 1223 1224 add $ap_real,64,$bp 1225 add $ap_real,64,$ap 1226 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); 1227 add %sp,LOCALS+$Z1sqr,$rp 1228 1229 add $bp_real,0,$bp 1230 add %sp,LOCALS+$Z1sqr,$ap 1231 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x); 1232 add %sp,LOCALS+$U2,$rp 1233 1234 add $ap_real,0,$bp 1235 call __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x); 1236 add %sp,LOCALS+$H,$rp 1237 1238 add $ap_real,64,$bp 1239 add %sp,LOCALS+$Z1sqr,$ap 1240 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); 1241 add %sp,LOCALS+$S2,$rp 1242 1243 add $ap_real,64,$bp 1244 add %sp,LOCALS+$H,$ap 1245 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); 1246 add %sp,LOCALS+$res_z,$rp 1247 1248 add $bp_real,32,$bp 1249 add %sp,LOCALS+$S2,$ap 1250 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); 1251 add %sp,LOCALS+$S2,$rp 1252 1253 add $ap_real,32,$bp 1254 call __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y); 1255 add %sp,LOCALS+$R,$rp 1256 1257 add %sp,LOCALS+$H,$bp 1258 add %sp,LOCALS+$H,$ap 1259 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); 1260 add %sp,LOCALS+$Hsqr,$rp 1261 1262 add %sp,LOCALS+$R,$bp 1263 add %sp,LOCALS+$R,$ap 1264 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); 1265 add %sp,LOCALS+$Rsqr,$rp 1266 1267 add %sp,LOCALS+$H,$bp 1268 add %sp,LOCALS+$Hsqr,$ap 1269 call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); 1270 add %sp,LOCALS+$Hcub,$rp 1271 1272 add $ap_real,0,$bp 1273 add %sp,LOCALS+$Hsqr,$ap 1274 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr); 1275 add %sp,LOCALS+$U2,$rp 1276 1277 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); 1278 add %sp,LOCALS+$Hsqr,$rp 1279 1280 add %sp,LOCALS+$Rsqr,$bp 1281 call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); 1282 add %sp,LOCALS+$res_x,$rp 1283 1284 add %sp,LOCALS+$Hcub,$bp 1285 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); 1286 add %sp,LOCALS+$res_x,$rp 1287 1288 add %sp,LOCALS+$U2,$bp 1289 call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); 1290 add %sp,LOCALS+$res_y,$rp 1291 1292 add $ap_real,32,$bp 1293 add %sp,LOCALS+$Hcub,$ap 1294 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub); 1295 add %sp,LOCALS+$S2,$rp 1296 1297 add %sp,LOCALS+$R,$bp 1298 add %sp,LOCALS+$res_y,$ap 1299 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); 1300 add %sp,LOCALS+$res_y,$rp 1301 1302 add %sp,LOCALS+$S2,$bp 1303 call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); 1304 add %sp,LOCALS+$res_y,$rp 1305 1306 ld [%fp+STACK_BIAS-16],$t1 ! !in1infty 1307 ld [%fp+STACK_BIAS-12],$t2 ! !in2infty 1308 ldx [%fp+STACK_BIAS-8],$rp 1309___ 1310for($i=0;$i<64;$i+=8) { # conditional moves 1311$code.=<<___; 1312 ld [%sp+LOCALS+$i],@acc[0] ! res 1313 ld [%sp+LOCALS+$i+4],@acc[1] 1314 ld [$bp_real+$i],@acc[2] ! in2 1315 ld [$bp_real+$i+4],@acc[3] 1316 ld [$ap_real+$i],@acc[4] ! in1 1317 ld [$ap_real+$i+4],@acc[5] 1318 movrz $t1,@acc[2],@acc[0] 1319 movrz $t1,@acc[3],@acc[1] 1320 movrz $t2,@acc[4],@acc[0] 1321 movrz $t2,@acc[5],@acc[1] 1322 st @acc[0],[$rp+$i] 1323 st @acc[1],[$rp+$i+4] 1324___ 1325} 1326for(;$i<96;$i+=8) { 1327my $j=($i-64)/4; 1328$code.=<<___; 1329 ld [%sp+LOCALS+$i],@acc[0] ! res 1330 ld [%sp+LOCALS+$i+4],@acc[1] 1331 ld [$ap_real+$i],@acc[4] ! in1 1332 ld [$ap_real+$i+4],@acc[5] 1333 movrz $t1,@ONE_mont[$j],@acc[0] 1334 movrz $t1,@ONE_mont[$j+1],@acc[1] 1335 movrz $t2,@acc[4],@acc[0] 1336 movrz $t2,@acc[5],@acc[1] 1337 st @acc[0],[$rp+$i] 1338 st @acc[1],[$rp+$i+4] 1339___ 1340} 1341$code.=<<___; 1342 ret 1343 restore 1344.type ecp_nistz256_point_add_affine,#function 1345.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine 1346___ 1347} }}} 1348{{{ 1349my ($out,$inp,$index)=map("%i$_",(0..2)); 1350my $mask="%o0"; 1351 1352$code.=<<___; 1353! void ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1, 1354! int %i2); 1355.globl ecp_nistz256_scatter_w5 1356.align 32 1357ecp_nistz256_scatter_w5: 1358 save %sp,-STACK_FRAME,%sp 1359 1360 sll $index,2,$index 1361 add $out,$index,$out 1362 1363 ld [$inp],%l0 ! X 1364 ld [$inp+4],%l1 1365 ld [$inp+8],%l2 1366 ld [$inp+12],%l3 1367 ld [$inp+16],%l4 1368 ld [$inp+20],%l5 1369 ld [$inp+24],%l6 1370 ld [$inp+28],%l7 1371 add $inp,32,$inp 1372 st %l0,[$out+64*0-4] 1373 st %l1,[$out+64*1-4] 1374 st %l2,[$out+64*2-4] 1375 st %l3,[$out+64*3-4] 1376 st %l4,[$out+64*4-4] 1377 st %l5,[$out+64*5-4] 1378 st %l6,[$out+64*6-4] 1379 st %l7,[$out+64*7-4] 1380 add $out,64*8,$out 1381 1382 ld [$inp],%l0 ! Y 1383 ld [$inp+4],%l1 1384 ld [$inp+8],%l2 1385 ld [$inp+12],%l3 1386 ld [$inp+16],%l4 1387 ld [$inp+20],%l5 1388 ld [$inp+24],%l6 1389 ld [$inp+28],%l7 1390 add $inp,32,$inp 1391 st %l0,[$out+64*0-4] 1392 st %l1,[$out+64*1-4] 1393 st %l2,[$out+64*2-4] 1394 st %l3,[$out+64*3-4] 1395 st %l4,[$out+64*4-4] 1396 st %l5,[$out+64*5-4] 1397 st %l6,[$out+64*6-4] 1398 st %l7,[$out+64*7-4] 1399 add $out,64*8,$out 1400 1401 ld [$inp],%l0 ! Z 1402 ld [$inp+4],%l1 1403 ld [$inp+8],%l2 1404 ld [$inp+12],%l3 1405 ld [$inp+16],%l4 1406 ld [$inp+20],%l5 1407 ld [$inp+24],%l6 1408 ld [$inp+28],%l7 1409 st %l0,[$out+64*0-4] 1410 st %l1,[$out+64*1-4] 1411 st %l2,[$out+64*2-4] 1412 st %l3,[$out+64*3-4] 1413 st %l4,[$out+64*4-4] 1414 st %l5,[$out+64*5-4] 1415 st %l6,[$out+64*6-4] 1416 st %l7,[$out+64*7-4] 1417 1418 ret 1419 restore 1420.type ecp_nistz256_scatter_w5,#function 1421.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 1422 1423! void ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1, 1424! int %i2); 1425.globl ecp_nistz256_gather_w5 1426.align 32 1427ecp_nistz256_gather_w5: 1428 save %sp,-STACK_FRAME,%sp 1429 1430 neg $index,$mask 1431 srax $mask,63,$mask 1432 1433 add $index,$mask,$index 1434 sll $index,2,$index 1435 add $inp,$index,$inp 1436 1437 ld [$inp+64*0],%l0 1438 ld [$inp+64*1],%l1 1439 ld [$inp+64*2],%l2 1440 ld [$inp+64*3],%l3 1441 ld [$inp+64*4],%l4 1442 ld [$inp+64*5],%l5 1443 ld [$inp+64*6],%l6 1444 ld [$inp+64*7],%l7 1445 add $inp,64*8,$inp 1446 and %l0,$mask,%l0 1447 and %l1,$mask,%l1 1448 st %l0,[$out] ! X 1449 and %l2,$mask,%l2 1450 st %l1,[$out+4] 1451 and %l3,$mask,%l3 1452 st %l2,[$out+8] 1453 and %l4,$mask,%l4 1454 st %l3,[$out+12] 1455 and %l5,$mask,%l5 1456 st %l4,[$out+16] 1457 and %l6,$mask,%l6 1458 st %l5,[$out+20] 1459 and %l7,$mask,%l7 1460 st %l6,[$out+24] 1461 st %l7,[$out+28] 1462 add $out,32,$out 1463 1464 ld [$inp+64*0],%l0 1465 ld [$inp+64*1],%l1 1466 ld [$inp+64*2],%l2 1467 ld [$inp+64*3],%l3 1468 ld [$inp+64*4],%l4 1469 ld [$inp+64*5],%l5 1470 ld [$inp+64*6],%l6 1471 ld [$inp+64*7],%l7 1472 add $inp,64*8,$inp 1473 and %l0,$mask,%l0 1474 and %l1,$mask,%l1 1475 st %l0,[$out] ! Y 1476 and %l2,$mask,%l2 1477 st %l1,[$out+4] 1478 and %l3,$mask,%l3 1479 st %l2,[$out+8] 1480 and %l4,$mask,%l4 1481 st %l3,[$out+12] 1482 and %l5,$mask,%l5 1483 st %l4,[$out+16] 1484 and %l6,$mask,%l6 1485 st %l5,[$out+20] 1486 and %l7,$mask,%l7 1487 st %l6,[$out+24] 1488 st %l7,[$out+28] 1489 add $out,32,$out 1490 1491 ld [$inp+64*0],%l0 1492 ld [$inp+64*1],%l1 1493 ld [$inp+64*2],%l2 1494 ld [$inp+64*3],%l3 1495 ld [$inp+64*4],%l4 1496 ld [$inp+64*5],%l5 1497 ld [$inp+64*6],%l6 1498 ld [$inp+64*7],%l7 1499 and %l0,$mask,%l0 1500 and %l1,$mask,%l1 1501 st %l0,[$out] ! Z 1502 and %l2,$mask,%l2 1503 st %l1,[$out+4] 1504 and %l3,$mask,%l3 1505 st %l2,[$out+8] 1506 and %l4,$mask,%l4 1507 st %l3,[$out+12] 1508 and %l5,$mask,%l5 1509 st %l4,[$out+16] 1510 and %l6,$mask,%l6 1511 st %l5,[$out+20] 1512 and %l7,$mask,%l7 1513 st %l6,[$out+24] 1514 st %l7,[$out+28] 1515 1516 ret 1517 restore 1518.type ecp_nistz256_gather_w5,#function 1519.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 1520 1521! void ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1, 1522! int %i2); 1523.globl ecp_nistz256_scatter_w7 1524.align 32 1525ecp_nistz256_scatter_w7: 1526 save %sp,-STACK_FRAME,%sp 1527 nop 1528 add $out,$index,$out 1529 mov 64/4,$index 1530.Loop_scatter_w7: 1531 ld [$inp],%l0 1532 add $inp,4,$inp 1533 subcc $index,1,$index 1534 stb %l0,[$out+64*0] 1535 srl %l0,8,%l1 1536 stb %l1,[$out+64*1] 1537 srl %l0,16,%l2 1538 stb %l2,[$out+64*2] 1539 srl %l0,24,%l3 1540 stb %l3,[$out+64*3] 1541 bne .Loop_scatter_w7 1542 add $out,64*4,$out 1543 1544 ret 1545 restore 1546.type ecp_nistz256_scatter_w7,#function 1547.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 1548 1549! void ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1, 1550! int %i2); 1551.globl ecp_nistz256_gather_w7 1552.align 32 1553ecp_nistz256_gather_w7: 1554 save %sp,-STACK_FRAME,%sp 1555 1556 neg $index,$mask 1557 srax $mask,63,$mask 1558 1559 add $index,$mask,$index 1560 add $inp,$index,$inp 1561 mov 64/4,$index 1562 1563.Loop_gather_w7: 1564 ldub [$inp+64*0],%l0 1565 prefetch [$inp+3840+64*0],1 1566 subcc $index,1,$index 1567 ldub [$inp+64*1],%l1 1568 prefetch [$inp+3840+64*1],1 1569 ldub [$inp+64*2],%l2 1570 prefetch [$inp+3840+64*2],1 1571 ldub [$inp+64*3],%l3 1572 prefetch [$inp+3840+64*3],1 1573 add $inp,64*4,$inp 1574 sll %l1,8,%l1 1575 sll %l2,16,%l2 1576 or %l0,%l1,%l0 1577 sll %l3,24,%l3 1578 or %l0,%l2,%l0 1579 or %l0,%l3,%l0 1580 and %l0,$mask,%l0 1581 st %l0,[$out] 1582 bne .Loop_gather_w7 1583 add $out,4,$out 1584 1585 ret 1586 restore 1587.type ecp_nistz256_gather_w7,#function 1588.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 1589___ 1590}}} 1591{{{ 1592######################################################################## 1593# Following subroutines are VIS3 counterparts of those above that 1594# implement ones found in ecp_nistz256.c. Key difference is that they 1595# use 128-bit multiplication and addition with 64-bit carry, and in order 1596# to do that they perform conversion from uin32_t[8] to uint64_t[4] upon 1597# entry and vice versa on return. 1598# 1599my ($rp,$ap,$bp)=map("%i$_",(0..2)); 1600my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7)); 1601my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5)); 1602my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1"); 1603my ($rp_real,$ap_real)=("%g2","%g3"); 1604my ($acc6,$acc7)=($bp,$bi); # used in squaring 1605 1606$code.=<<___; 1607.align 32 1608__ecp_nistz256_mul_by_2_vis3: 1609 addcc $acc0,$acc0,$acc0 1610 addxccc $acc1,$acc1,$acc1 1611 addxccc $acc2,$acc2,$acc2 1612 addxccc $acc3,$acc3,$acc3 1613 b .Lreduce_by_sub_vis3 1614 addxc %g0,%g0,$acc4 ! did it carry? 1615.type __ecp_nistz256_mul_by_2_vis3,#function 1616.size __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3 1617 1618.align 32 1619__ecp_nistz256_add_vis3: 1620 ldx [$bp+0],$t0 1621 ldx [$bp+8],$t1 1622 ldx [$bp+16],$t2 1623 ldx [$bp+24],$t3 1624 1625__ecp_nistz256_add_noload_vis3: 1626 1627 addcc $t0,$acc0,$acc0 1628 addxccc $t1,$acc1,$acc1 1629 addxccc $t2,$acc2,$acc2 1630 addxccc $t3,$acc3,$acc3 1631 addxc %g0,%g0,$acc4 ! did it carry? 1632 1633.Lreduce_by_sub_vis3: 1634 1635 addcc $acc0,1,$t0 ! add -modulus, i.e. subtract 1636 addxccc $acc1,$poly1,$t1 1637 addxccc $acc2,$minus1,$t2 1638 addxccc $acc3,$poly3,$t3 1639 addxc $acc4,$minus1,$acc4 1640 1641 movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus 1642 movrz $acc4,$t1,$acc1 1643 stx $acc0,[$rp] 1644 movrz $acc4,$t2,$acc2 1645 stx $acc1,[$rp+8] 1646 movrz $acc4,$t3,$acc3 1647 stx $acc2,[$rp+16] 1648 retl 1649 stx $acc3,[$rp+24] 1650.type __ecp_nistz256_add_vis3,#function 1651.size __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3 1652 1653! Trouble with subtraction is that there is no subtraction with 64-bit 1654! borrow, only with 32-bit one. For this reason we "decompose" 64-bit 1655! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But 1656! recall that SPARC is big-endian, which is why you'll observe that 1657! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior reduction we 1658! "collect" result back to 64-bit $acc0-$acc3. 1659.align 32 1660__ecp_nistz256_sub_from_vis3: 1661 ld [$bp+4],$t0 1662 ld [$bp+0],$t1 1663 ld [$bp+12],$t2 1664 ld [$bp+8],$t3 1665 1666 srlx $acc0,32,$acc4 1667 not $poly1,$poly1 1668 srlx $acc1,32,$acc5 1669 subcc $acc0,$t0,$acc0 1670 ld [$bp+20],$t0 1671 subccc $acc4,$t1,$acc4 1672 ld [$bp+16],$t1 1673 subccc $acc1,$t2,$acc1 1674 ld [$bp+28],$t2 1675 and $acc0,$poly1,$acc0 1676 subccc $acc5,$t3,$acc5 1677 ld [$bp+24],$t3 1678 sllx $acc4,32,$acc4 1679 and $acc1,$poly1,$acc1 1680 sllx $acc5,32,$acc5 1681 or $acc0,$acc4,$acc0 1682 srlx $acc2,32,$acc4 1683 or $acc1,$acc5,$acc1 1684 srlx $acc3,32,$acc5 1685 subccc $acc2,$t0,$acc2 1686 subccc $acc4,$t1,$acc4 1687 subccc $acc3,$t2,$acc3 1688 and $acc2,$poly1,$acc2 1689 subccc $acc5,$t3,$acc5 1690 sllx $acc4,32,$acc4 1691 and $acc3,$poly1,$acc3 1692 sllx $acc5,32,$acc5 1693 or $acc2,$acc4,$acc2 1694 subc %g0,%g0,$acc4 ! did it borrow? 1695 b .Lreduce_by_add_vis3 1696 or $acc3,$acc5,$acc3 1697.type __ecp_nistz256_sub_from_vis3,#function 1698.size __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3 1699 1700.align 32 1701__ecp_nistz256_sub_morf_vis3: 1702 ld [$bp+4],$t0 1703 ld [$bp+0],$t1 1704 ld [$bp+12],$t2 1705 ld [$bp+8],$t3 1706 1707 srlx $acc0,32,$acc4 1708 not $poly1,$poly1 1709 srlx $acc1,32,$acc5 1710 subcc $t0,$acc0,$acc0 1711 ld [$bp+20],$t0 1712 subccc $t1,$acc4,$acc4 1713 ld [$bp+16],$t1 1714 subccc $t2,$acc1,$acc1 1715 ld [$bp+28],$t2 1716 and $acc0,$poly1,$acc0 1717 subccc $t3,$acc5,$acc5 1718 ld [$bp+24],$t3 1719 sllx $acc4,32,$acc4 1720 and $acc1,$poly1,$acc1 1721 sllx $acc5,32,$acc5 1722 or $acc0,$acc4,$acc0 1723 srlx $acc2,32,$acc4 1724 or $acc1,$acc5,$acc1 1725 srlx $acc3,32,$acc5 1726 subccc $t0,$acc2,$acc2 1727 subccc $t1,$acc4,$acc4 1728 subccc $t2,$acc3,$acc3 1729 and $acc2,$poly1,$acc2 1730 subccc $t3,$acc5,$acc5 1731 sllx $acc4,32,$acc4 1732 and $acc3,$poly1,$acc3 1733 sllx $acc5,32,$acc5 1734 or $acc2,$acc4,$acc2 1735 subc %g0,%g0,$acc4 ! did it borrow? 1736 or $acc3,$acc5,$acc3 1737 1738.Lreduce_by_add_vis3: 1739 1740 addcc $acc0,-1,$t0 ! add modulus 1741 not $poly3,$t3 1742 addxccc $acc1,$poly1,$t1 1743 not $poly1,$poly1 ! restore $poly1 1744 addxccc $acc2,%g0,$t2 1745 addxc $acc3,$t3,$t3 1746 1747 movrnz $acc4,$t0,$acc0 ! if a-b borrowed, ret = ret+mod 1748 movrnz $acc4,$t1,$acc1 1749 stx $acc0,[$rp] 1750 movrnz $acc4,$t2,$acc2 1751 stx $acc1,[$rp+8] 1752 movrnz $acc4,$t3,$acc3 1753 stx $acc2,[$rp+16] 1754 retl 1755 stx $acc3,[$rp+24] 1756.type __ecp_nistz256_sub_morf_vis3,#function 1757.size __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3 1758 1759.align 32 1760__ecp_nistz256_div_by_2_vis3: 1761 ! ret = (a is odd ? a+mod : a) >> 1 1762 1763 not $poly1,$t1 1764 not $poly3,$t3 1765 and $acc0,1,$acc5 1766 addcc $acc0,-1,$t0 ! add modulus 1767 addxccc $acc1,$t1,$t1 1768 addxccc $acc2,%g0,$t2 1769 addxccc $acc3,$t3,$t3 1770 addxc %g0,%g0,$acc4 ! carry bit 1771 1772 movrnz $acc5,$t0,$acc0 1773 movrnz $acc5,$t1,$acc1 1774 movrnz $acc5,$t2,$acc2 1775 movrnz $acc5,$t3,$acc3 1776 movrz $acc5,%g0,$acc4 1777 1778 ! ret >>= 1 1779 1780 srlx $acc0,1,$acc0 1781 sllx $acc1,63,$t0 1782 srlx $acc1,1,$acc1 1783 or $acc0,$t0,$acc0 1784 sllx $acc2,63,$t1 1785 srlx $acc2,1,$acc2 1786 or $acc1,$t1,$acc1 1787 sllx $acc3,63,$t2 1788 stx $acc0,[$rp] 1789 srlx $acc3,1,$acc3 1790 or $acc2,$t2,$acc2 1791 sllx $acc4,63,$t3 ! don't forget carry bit 1792 stx $acc1,[$rp+8] 1793 or $acc3,$t3,$acc3 1794 stx $acc2,[$rp+16] 1795 retl 1796 stx $acc3,[$rp+24] 1797.type __ecp_nistz256_div_by_2_vis3,#function 1798.size __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3 1799 1800! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and 1801! 4x faster [on T4]... 1802.align 32 1803__ecp_nistz256_mul_mont_vis3: 1804 mulx $a0,$bi,$acc0 1805 not $poly3,$poly3 ! 0xFFFFFFFF00000001 1806 umulxhi $a0,$bi,$t0 1807 mulx $a1,$bi,$acc1 1808 umulxhi $a1,$bi,$t1 1809 mulx $a2,$bi,$acc2 1810 umulxhi $a2,$bi,$t2 1811 mulx $a3,$bi,$acc3 1812 umulxhi $a3,$bi,$t3 1813 ldx [$bp+8],$bi ! b[1] 1814 1815 addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication 1816 sllx $acc0,32,$t0 1817 addxccc $acc2,$t1,$acc2 1818 srlx $acc0,32,$t1 1819 addxccc $acc3,$t2,$acc3 1820 addxc %g0,$t3,$acc4 1821 mov 0,$acc5 1822___ 1823for($i=1;$i<4;$i++) { 1824 # Reduction iteration is normally performed by accumulating 1825 # result of multiplication of modulus by "magic" digit [and 1826 # omitting least significant word, which is guaranteed to 1827 # be 0], but thanks to special form of modulus and "magic" 1828 # digit being equal to least significant word, it can be 1829 # performed with additions and subtractions alone. Indeed: 1830 # 1831 # ffff0001.00000000.0000ffff.ffffffff 1832 # * abcdefgh 1833 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh 1834 # 1835 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 1836 # rewrite above as: 1837 # 1838 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh 1839 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 1840 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh 1841 # 1842 # or marking redundant operations: 1843 # 1844 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- 1845 # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- 1846 # - 0000abcd.efgh0000.--------.--------.-------- 1847 # ^^^^^^^^ but this word is calculated with umulxhi, because 1848 # there is no subtract with 64-bit borrow:-( 1849 1850$code.=<<___; 1851 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part 1852 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part 1853 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] 1854 mulx $a0,$bi,$t0 1855 addxccc $acc2,$t1,$acc1 1856 mulx $a1,$bi,$t1 1857 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 1858 mulx $a2,$bi,$t2 1859 addxccc $acc4,$t3,$acc3 1860 mulx $a3,$bi,$t3 1861 addxc $acc5,%g0,$acc4 1862 1863 addcc $acc0,$t0,$acc0 ! accumulate low parts of multiplication 1864 umulxhi $a0,$bi,$t0 1865 addxccc $acc1,$t1,$acc1 1866 umulxhi $a1,$bi,$t1 1867 addxccc $acc2,$t2,$acc2 1868 umulxhi $a2,$bi,$t2 1869 addxccc $acc3,$t3,$acc3 1870 umulxhi $a3,$bi,$t3 1871 addxc $acc4,%g0,$acc4 1872___ 1873$code.=<<___ if ($i<3); 1874 ldx [$bp+8*($i+1)],$bi ! bp[$i+1] 1875___ 1876$code.=<<___; 1877 addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication 1878 sllx $acc0,32,$t0 1879 addxccc $acc2,$t1,$acc2 1880 srlx $acc0,32,$t1 1881 addxccc $acc3,$t2,$acc3 1882 addxccc $acc4,$t3,$acc4 1883 addxc %g0,%g0,$acc5 1884___ 1885} 1886$code.=<<___; 1887 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part 1888 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part 1889 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] 1890 addxccc $acc2,$t1,$acc1 1891 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 1892 addxccc $acc4,$t3,$acc3 1893 b .Lmul_final_vis3 ! see below 1894 addxc $acc5,%g0,$acc4 1895.type __ecp_nistz256_mul_mont_vis3,#function 1896.size __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3 1897 1898! compared to above __ecp_nistz256_mul_mont_vis3 it's 21% less 1899! instructions, but only 14% faster [on T4]... 1900.align 32 1901__ecp_nistz256_sqr_mont_vis3: 1902 ! | | | | | |a1*a0| | 1903 ! | | | | |a2*a0| | | 1904 ! | |a3*a2|a3*a0| | | | 1905 ! | | | |a2*a1| | | | 1906 ! | | |a3*a1| | | | | 1907 ! *| | | | | | | | 2| 1908 ! +|a3*a3|a2*a2|a1*a1|a0*a0| 1909 ! |--+--+--+--+--+--+--+--| 1910 ! |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx 1911 ! 1912 ! "can't overflow" below mark carrying into high part of 1913 ! multiplication result, which can't overflow, because it 1914 ! can never be all ones. 1915 1916 mulx $a1,$a0,$acc1 ! a[1]*a[0] 1917 umulxhi $a1,$a0,$t1 1918 mulx $a2,$a0,$acc2 ! a[2]*a[0] 1919 umulxhi $a2,$a0,$t2 1920 mulx $a3,$a0,$acc3 ! a[3]*a[0] 1921 umulxhi $a3,$a0,$acc4 1922 1923 addcc $acc2,$t1,$acc2 ! accumulate high parts of multiplication 1924 mulx $a2,$a1,$t0 ! a[2]*a[1] 1925 umulxhi $a2,$a1,$t1 1926 addxccc $acc3,$t2,$acc3 1927 mulx $a3,$a1,$t2 ! a[3]*a[1] 1928 umulxhi $a3,$a1,$t3 1929 addxc $acc4,%g0,$acc4 ! can't overflow 1930 1931 mulx $a3,$a2,$acc5 ! a[3]*a[2] 1932 not $poly3,$poly3 ! 0xFFFFFFFF00000001 1933 umulxhi $a3,$a2,$acc6 1934 1935 addcc $t2,$t1,$t1 ! accumulate high parts of multiplication 1936 mulx $a0,$a0,$acc0 ! a[0]*a[0] 1937 addxc $t3,%g0,$t2 ! can't overflow 1938 1939 addcc $acc3,$t0,$acc3 ! accumulate low parts of multiplication 1940 umulxhi $a0,$a0,$a0 1941 addxccc $acc4,$t1,$acc4 1942 mulx $a1,$a1,$t1 ! a[1]*a[1] 1943 addxccc $acc5,$t2,$acc5 1944 umulxhi $a1,$a1,$a1 1945 addxc $acc6,%g0,$acc6 ! can't overflow 1946 1947 addcc $acc1,$acc1,$acc1 ! acc[1-6]*=2 1948 mulx $a2,$a2,$t2 ! a[2]*a[2] 1949 addxccc $acc2,$acc2,$acc2 1950 umulxhi $a2,$a2,$a2 1951 addxccc $acc3,$acc3,$acc3 1952 mulx $a3,$a3,$t3 ! a[3]*a[3] 1953 addxccc $acc4,$acc4,$acc4 1954 umulxhi $a3,$a3,$a3 1955 addxccc $acc5,$acc5,$acc5 1956 addxccc $acc6,$acc6,$acc6 1957 addxc %g0,%g0,$acc7 1958 1959 addcc $acc1,$a0,$acc1 ! +a[i]*a[i] 1960 addxccc $acc2,$t1,$acc2 1961 addxccc $acc3,$a1,$acc3 1962 addxccc $acc4,$t2,$acc4 1963 sllx $acc0,32,$t0 1964 addxccc $acc5,$a2,$acc5 1965 srlx $acc0,32,$t1 1966 addxccc $acc6,$t3,$acc6 1967 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part 1968 addxc $acc7,$a3,$acc7 1969___ 1970for($i=0;$i<3;$i++) { # reductions, see commentary 1971 # in multiplication for details 1972$code.=<<___; 1973 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part 1974 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] 1975 sllx $acc0,32,$t0 1976 addxccc $acc2,$t1,$acc1 1977 srlx $acc0,32,$t1 1978 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 1979 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part 1980 addxc %g0,$t3,$acc3 ! can't overflow 1981___ 1982} 1983$code.=<<___; 1984 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part 1985 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] 1986 addxccc $acc2,$t1,$acc1 1987 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 1988 addxc %g0,$t3,$acc3 ! can't overflow 1989 1990 addcc $acc0,$acc4,$acc0 ! accumulate upper half 1991 addxccc $acc1,$acc5,$acc1 1992 addxccc $acc2,$acc6,$acc2 1993 addxccc $acc3,$acc7,$acc3 1994 addxc %g0,%g0,$acc4 1995 1996.Lmul_final_vis3: 1997 1998 ! Final step is "if result > mod, subtract mod", but as comparison 1999 ! means subtraction, we do the subtraction and then copy outcome 2000 ! if it didn't borrow. But note that as we [have to] replace 2001 ! subtraction with addition with negative, carry/borrow logic is 2002 ! inverse. 2003 2004 addcc $acc0,1,$t0 ! add -modulus, i.e. subtract 2005 not $poly3,$poly3 ! restore 0x00000000FFFFFFFE 2006 addxccc $acc1,$poly1,$t1 2007 addxccc $acc2,$minus1,$t2 2008 addxccc $acc3,$poly3,$t3 2009 addxccc $acc4,$minus1,%g0 ! did it carry? 2010 2011 movcs %xcc,$t0,$acc0 2012 movcs %xcc,$t1,$acc1 2013 stx $acc0,[$rp] 2014 movcs %xcc,$t2,$acc2 2015 stx $acc1,[$rp+8] 2016 movcs %xcc,$t3,$acc3 2017 stx $acc2,[$rp+16] 2018 retl 2019 stx $acc3,[$rp+24] 2020.type __ecp_nistz256_sqr_mont_vis3,#function 2021.size __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3 2022___ 2023 2024######################################################################## 2025# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 2026# 2027{ 2028my ($res_x,$res_y,$res_z, 2029 $in_x,$in_y,$in_z, 2030 $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9)); 2031# above map() describes stack layout with 10 temporary 2032# 256-bit vectors on top. 2033 2034$code.=<<___; 2035.align 32 2036ecp_nistz256_point_double_vis3: 2037 save %sp,-STACK64_FRAME-32*10,%sp 2038 2039 mov $rp,$rp_real 2040.Ldouble_shortcut_vis3: 2041 mov -1,$minus1 2042 mov -2,$poly3 2043 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 2044 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE 2045 2046 ! convert input to uint64_t[4] 2047 ld [$ap],$a0 ! in_x 2048 ld [$ap+4],$t0 2049 ld [$ap+8],$a1 2050 ld [$ap+12],$t1 2051 ld [$ap+16],$a2 2052 ld [$ap+20],$t2 2053 ld [$ap+24],$a3 2054 ld [$ap+28],$t3 2055 sllx $t0,32,$t0 2056 sllx $t1,32,$t1 2057 ld [$ap+32],$acc0 ! in_y 2058 or $a0,$t0,$a0 2059 ld [$ap+32+4],$t0 2060 sllx $t2,32,$t2 2061 ld [$ap+32+8],$acc1 2062 or $a1,$t1,$a1 2063 ld [$ap+32+12],$t1 2064 sllx $t3,32,$t3 2065 ld [$ap+32+16],$acc2 2066 or $a2,$t2,$a2 2067 ld [$ap+32+20],$t2 2068 or $a3,$t3,$a3 2069 ld [$ap+32+24],$acc3 2070 sllx $t0,32,$t0 2071 ld [$ap+32+28],$t3 2072 sllx $t1,32,$t1 2073 stx $a0,[%sp+LOCALS64+$in_x] 2074 sllx $t2,32,$t2 2075 stx $a1,[%sp+LOCALS64+$in_x+8] 2076 sllx $t3,32,$t3 2077 stx $a2,[%sp+LOCALS64+$in_x+16] 2078 or $acc0,$t0,$acc0 2079 stx $a3,[%sp+LOCALS64+$in_x+24] 2080 or $acc1,$t1,$acc1 2081 stx $acc0,[%sp+LOCALS64+$in_y] 2082 or $acc2,$t2,$acc2 2083 stx $acc1,[%sp+LOCALS64+$in_y+8] 2084 or $acc3,$t3,$acc3 2085 stx $acc2,[%sp+LOCALS64+$in_y+16] 2086 stx $acc3,[%sp+LOCALS64+$in_y+24] 2087 2088 ld [$ap+64],$a0 ! in_z 2089 ld [$ap+64+4],$t0 2090 ld [$ap+64+8],$a1 2091 ld [$ap+64+12],$t1 2092 ld [$ap+64+16],$a2 2093 ld [$ap+64+20],$t2 2094 ld [$ap+64+24],$a3 2095 ld [$ap+64+28],$t3 2096 sllx $t0,32,$t0 2097 sllx $t1,32,$t1 2098 or $a0,$t0,$a0 2099 sllx $t2,32,$t2 2100 or $a1,$t1,$a1 2101 sllx $t3,32,$t3 2102 or $a2,$t2,$a2 2103 or $a3,$t3,$a3 2104 sllx $t0,32,$t0 2105 sllx $t1,32,$t1 2106 stx $a0,[%sp+LOCALS64+$in_z] 2107 sllx $t2,32,$t2 2108 stx $a1,[%sp+LOCALS64+$in_z+8] 2109 sllx $t3,32,$t3 2110 stx $a2,[%sp+LOCALS64+$in_z+16] 2111 stx $a3,[%sp+LOCALS64+$in_z+24] 2112 2113 ! in_y is still in $acc0-$acc3 2114 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(S, in_y); 2115 add %sp,LOCALS64+$S,$rp 2116 2117 ! in_z is still in $a0-$a3 2118 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Zsqr, in_z); 2119 add %sp,LOCALS64+$Zsqr,$rp 2120 2121 mov $acc0,$a0 ! put Zsqr aside 2122 mov $acc1,$a1 2123 mov $acc2,$a2 2124 mov $acc3,$a3 2125 2126 add %sp,LOCALS64+$in_x,$bp 2127 call __ecp_nistz256_add_vis3 ! p256_add(M, Zsqr, in_x); 2128 add %sp,LOCALS64+$M,$rp 2129 2130 mov $a0,$acc0 ! restore Zsqr 2131 ldx [%sp+LOCALS64+$S],$a0 ! forward load 2132 mov $a1,$acc1 2133 ldx [%sp+LOCALS64+$S+8],$a1 2134 mov $a2,$acc2 2135 ldx [%sp+LOCALS64+$S+16],$a2 2136 mov $a3,$acc3 2137 ldx [%sp+LOCALS64+$S+24],$a3 2138 2139 add %sp,LOCALS64+$in_x,$bp 2140 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(Zsqr, in_x, Zsqr); 2141 add %sp,LOCALS64+$Zsqr,$rp 2142 2143 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(S, S); 2144 add %sp,LOCALS64+$S,$rp 2145 2146 ldx [%sp+LOCALS64+$in_z],$bi 2147 ldx [%sp+LOCALS64+$in_y],$a0 2148 ldx [%sp+LOCALS64+$in_y+8],$a1 2149 ldx [%sp+LOCALS64+$in_y+16],$a2 2150 ldx [%sp+LOCALS64+$in_y+24],$a3 2151 add %sp,LOCALS64+$in_z,$bp 2152 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(tmp0, in_z, in_y); 2153 add %sp,LOCALS64+$tmp0,$rp 2154 2155 ldx [%sp+LOCALS64+$M],$bi ! forward load 2156 ldx [%sp+LOCALS64+$Zsqr],$a0 2157 ldx [%sp+LOCALS64+$Zsqr+8],$a1 2158 ldx [%sp+LOCALS64+$Zsqr+16],$a2 2159 ldx [%sp+LOCALS64+$Zsqr+24],$a3 2160 2161 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(res_z, tmp0); 2162 add %sp,LOCALS64+$res_z,$rp 2163 2164 add %sp,LOCALS64+$M,$bp 2165 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(M, M, Zsqr); 2166 add %sp,LOCALS64+$M,$rp 2167 2168 mov $acc0,$a0 ! put aside M 2169 mov $acc1,$a1 2170 mov $acc2,$a2 2171 mov $acc3,$a3 2172 call __ecp_nistz256_mul_by_2_vis3 2173 add %sp,LOCALS64+$M,$rp 2174 mov $a0,$t0 ! copy M 2175 ldx [%sp+LOCALS64+$S],$a0 ! forward load 2176 mov $a1,$t1 2177 ldx [%sp+LOCALS64+$S+8],$a1 2178 mov $a2,$t2 2179 ldx [%sp+LOCALS64+$S+16],$a2 2180 mov $a3,$t3 2181 ldx [%sp+LOCALS64+$S+24],$a3 2182 call __ecp_nistz256_add_noload_vis3 ! p256_mul_by_3(M, M); 2183 add %sp,LOCALS64+$M,$rp 2184 2185 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(tmp0, S); 2186 add %sp,LOCALS64+$tmp0,$rp 2187 2188 ldx [%sp+LOCALS64+$S],$bi ! forward load 2189 ldx [%sp+LOCALS64+$in_x],$a0 2190 ldx [%sp+LOCALS64+$in_x+8],$a1 2191 ldx [%sp+LOCALS64+$in_x+16],$a2 2192 ldx [%sp+LOCALS64+$in_x+24],$a3 2193 2194 call __ecp_nistz256_div_by_2_vis3 ! p256_div_by_2(res_y, tmp0); 2195 add %sp,LOCALS64+$res_y,$rp 2196 2197 add %sp,LOCALS64+$S,$bp 2198 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, in_x); 2199 add %sp,LOCALS64+$S,$rp 2200 2201 ldx [%sp+LOCALS64+$M],$a0 ! forward load 2202 ldx [%sp+LOCALS64+$M+8],$a1 2203 ldx [%sp+LOCALS64+$M+16],$a2 2204 ldx [%sp+LOCALS64+$M+24],$a3 2205 2206 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(tmp0, S); 2207 add %sp,LOCALS64+$tmp0,$rp 2208 2209 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(res_x, M); 2210 add %sp,LOCALS64+$res_x,$rp 2211 2212 add %sp,LOCALS64+$tmp0,$bp 2213 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, tmp0); 2214 add %sp,LOCALS64+$res_x,$rp 2215 2216 ldx [%sp+LOCALS64+$M],$a0 ! forward load 2217 ldx [%sp+LOCALS64+$M+8],$a1 2218 ldx [%sp+LOCALS64+$M+16],$a2 2219 ldx [%sp+LOCALS64+$M+24],$a3 2220 2221 add %sp,LOCALS64+$S,$bp 2222 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(S, S, res_x); 2223 add %sp,LOCALS64+$S,$rp 2224 2225 mov $acc0,$bi 2226 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, M); 2227 add %sp,LOCALS64+$S,$rp 2228 2229 ldx [%sp+LOCALS64+$res_x],$a0 ! forward load 2230 ldx [%sp+LOCALS64+$res_x+8],$a1 2231 ldx [%sp+LOCALS64+$res_x+16],$a2 2232 ldx [%sp+LOCALS64+$res_x+24],$a3 2233 2234 add %sp,LOCALS64+$res_y,$bp 2235 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, S, res_y); 2236 add %sp,LOCALS64+$res_y,$bp 2237 2238 ! convert output to uint_32[8] 2239 srlx $a0,32,$t0 2240 srlx $a1,32,$t1 2241 st $a0,[$rp_real] ! res_x 2242 srlx $a2,32,$t2 2243 st $t0,[$rp_real+4] 2244 srlx $a3,32,$t3 2245 st $a1,[$rp_real+8] 2246 st $t1,[$rp_real+12] 2247 st $a2,[$rp_real+16] 2248 st $t2,[$rp_real+20] 2249 st $a3,[$rp_real+24] 2250 st $t3,[$rp_real+28] 2251 2252 ldx [%sp+LOCALS64+$res_z],$a0 ! forward load 2253 srlx $acc0,32,$t0 2254 ldx [%sp+LOCALS64+$res_z+8],$a1 2255 srlx $acc1,32,$t1 2256 ldx [%sp+LOCALS64+$res_z+16],$a2 2257 srlx $acc2,32,$t2 2258 ldx [%sp+LOCALS64+$res_z+24],$a3 2259 srlx $acc3,32,$t3 2260 st $acc0,[$rp_real+32] ! res_y 2261 st $t0, [$rp_real+32+4] 2262 st $acc1,[$rp_real+32+8] 2263 st $t1, [$rp_real+32+12] 2264 st $acc2,[$rp_real+32+16] 2265 st $t2, [$rp_real+32+20] 2266 st $acc3,[$rp_real+32+24] 2267 st $t3, [$rp_real+32+28] 2268 2269 srlx $a0,32,$t0 2270 srlx $a1,32,$t1 2271 st $a0,[$rp_real+64] ! res_z 2272 srlx $a2,32,$t2 2273 st $t0,[$rp_real+64+4] 2274 srlx $a3,32,$t3 2275 st $a1,[$rp_real+64+8] 2276 st $t1,[$rp_real+64+12] 2277 st $a2,[$rp_real+64+16] 2278 st $t2,[$rp_real+64+20] 2279 st $a3,[$rp_real+64+24] 2280 st $t3,[$rp_real+64+28] 2281 2282 ret 2283 restore 2284.type ecp_nistz256_point_double_vis3,#function 2285.size ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3 2286___ 2287} 2288######################################################################## 2289# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 2290# const P256_POINT *in2); 2291{ 2292my ($res_x,$res_y,$res_z, 2293 $in1_x,$in1_y,$in1_z, 2294 $in2_x,$in2_y,$in2_z, 2295 $H,$Hsqr,$R,$Rsqr,$Hcub, 2296 $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); 2297my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 2298 2299# above map() describes stack layout with 18 temporary 2300# 256-bit vectors on top. Then we reserve some space for 2301# !in1infty, !in2infty and result of check for zero. 2302 2303$code.=<<___; 2304.align 32 2305ecp_nistz256_point_add_vis3: 2306 save %sp,-STACK64_FRAME-32*18-32,%sp 2307 2308 mov $rp,$rp_real 2309 mov -1,$minus1 2310 mov -2,$poly3 2311 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 2312 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE 2313 2314 ! convert input to uint64_t[4] 2315 ld [$bp],$a0 ! in2_x 2316 ld [$bp+4],$t0 2317 ld [$bp+8],$a1 2318 ld [$bp+12],$t1 2319 ld [$bp+16],$a2 2320 ld [$bp+20],$t2 2321 ld [$bp+24],$a3 2322 ld [$bp+28],$t3 2323 sllx $t0,32,$t0 2324 sllx $t1,32,$t1 2325 ld [$bp+32],$acc0 ! in2_y 2326 or $a0,$t0,$a0 2327 ld [$bp+32+4],$t0 2328 sllx $t2,32,$t2 2329 ld [$bp+32+8],$acc1 2330 or $a1,$t1,$a1 2331 ld [$bp+32+12],$t1 2332 sllx $t3,32,$t3 2333 ld [$bp+32+16],$acc2 2334 or $a2,$t2,$a2 2335 ld [$bp+32+20],$t2 2336 or $a3,$t3,$a3 2337 ld [$bp+32+24],$acc3 2338 sllx $t0,32,$t0 2339 ld [$bp+32+28],$t3 2340 sllx $t1,32,$t1 2341 stx $a0,[%sp+LOCALS64+$in2_x] 2342 sllx $t2,32,$t2 2343 stx $a1,[%sp+LOCALS64+$in2_x+8] 2344 sllx $t3,32,$t3 2345 stx $a2,[%sp+LOCALS64+$in2_x+16] 2346 or $acc0,$t0,$acc0 2347 stx $a3,[%sp+LOCALS64+$in2_x+24] 2348 or $acc1,$t1,$acc1 2349 stx $acc0,[%sp+LOCALS64+$in2_y] 2350 or $acc2,$t2,$acc2 2351 stx $acc1,[%sp+LOCALS64+$in2_y+8] 2352 or $acc3,$t3,$acc3 2353 stx $acc2,[%sp+LOCALS64+$in2_y+16] 2354 stx $acc3,[%sp+LOCALS64+$in2_y+24] 2355 2356 ld [$bp+64],$acc0 ! in2_z 2357 ld [$bp+64+4],$t0 2358 ld [$bp+64+8],$acc1 2359 ld [$bp+64+12],$t1 2360 ld [$bp+64+16],$acc2 2361 ld [$bp+64+20],$t2 2362 ld [$bp+64+24],$acc3 2363 ld [$bp+64+28],$t3 2364 sllx $t0,32,$t0 2365 sllx $t1,32,$t1 2366 ld [$ap],$a0 ! in1_x 2367 or $acc0,$t0,$acc0 2368 ld [$ap+4],$t0 2369 sllx $t2,32,$t2 2370 ld [$ap+8],$a1 2371 or $acc1,$t1,$acc1 2372 ld [$ap+12],$t1 2373 sllx $t3,32,$t3 2374 ld [$ap+16],$a2 2375 or $acc2,$t2,$acc2 2376 ld [$ap+20],$t2 2377 or $acc3,$t3,$acc3 2378 ld [$ap+24],$a3 2379 sllx $t0,32,$t0 2380 ld [$ap+28],$t3 2381 sllx $t1,32,$t1 2382 stx $acc0,[%sp+LOCALS64+$in2_z] 2383 sllx $t2,32,$t2 2384 stx $acc1,[%sp+LOCALS64+$in2_z+8] 2385 sllx $t3,32,$t3 2386 stx $acc2,[%sp+LOCALS64+$in2_z+16] 2387 stx $acc3,[%sp+LOCALS64+$in2_z+24] 2388 2389 or $acc1,$acc0,$acc0 2390 or $acc3,$acc2,$acc2 2391 or $acc2,$acc0,$acc0 2392 movrnz $acc0,-1,$acc0 ! !in2infty 2393 stx $acc0,[%fp+STACK_BIAS-8] 2394 2395 or $a0,$t0,$a0 2396 ld [$ap+32],$acc0 ! in1_y 2397 or $a1,$t1,$a1 2398 ld [$ap+32+4],$t0 2399 or $a2,$t2,$a2 2400 ld [$ap+32+8],$acc1 2401 or $a3,$t3,$a3 2402 ld [$ap+32+12],$t1 2403 ld [$ap+32+16],$acc2 2404 ld [$ap+32+20],$t2 2405 ld [$ap+32+24],$acc3 2406 sllx $t0,32,$t0 2407 ld [$ap+32+28],$t3 2408 sllx $t1,32,$t1 2409 stx $a0,[%sp+LOCALS64+$in1_x] 2410 sllx $t2,32,$t2 2411 stx $a1,[%sp+LOCALS64+$in1_x+8] 2412 sllx $t3,32,$t3 2413 stx $a2,[%sp+LOCALS64+$in1_x+16] 2414 or $acc0,$t0,$acc0 2415 stx $a3,[%sp+LOCALS64+$in1_x+24] 2416 or $acc1,$t1,$acc1 2417 stx $acc0,[%sp+LOCALS64+$in1_y] 2418 or $acc2,$t2,$acc2 2419 stx $acc1,[%sp+LOCALS64+$in1_y+8] 2420 or $acc3,$t3,$acc3 2421 stx $acc2,[%sp+LOCALS64+$in1_y+16] 2422 stx $acc3,[%sp+LOCALS64+$in1_y+24] 2423 2424 ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load 2425 ldx [%sp+LOCALS64+$in2_z+8],$a1 2426 ldx [%sp+LOCALS64+$in2_z+16],$a2 2427 ldx [%sp+LOCALS64+$in2_z+24],$a3 2428 2429 ld [$ap+64],$acc0 ! in1_z 2430 ld [$ap+64+4],$t0 2431 ld [$ap+64+8],$acc1 2432 ld [$ap+64+12],$t1 2433 ld [$ap+64+16],$acc2 2434 ld [$ap+64+20],$t2 2435 ld [$ap+64+24],$acc3 2436 ld [$ap+64+28],$t3 2437 sllx $t0,32,$t0 2438 sllx $t1,32,$t1 2439 or $acc0,$t0,$acc0 2440 sllx $t2,32,$t2 2441 or $acc1,$t1,$acc1 2442 sllx $t3,32,$t3 2443 stx $acc0,[%sp+LOCALS64+$in1_z] 2444 or $acc2,$t2,$acc2 2445 stx $acc1,[%sp+LOCALS64+$in1_z+8] 2446 or $acc3,$t3,$acc3 2447 stx $acc2,[%sp+LOCALS64+$in1_z+16] 2448 stx $acc3,[%sp+LOCALS64+$in1_z+24] 2449 2450 or $acc1,$acc0,$acc0 2451 or $acc3,$acc2,$acc2 2452 or $acc2,$acc0,$acc0 2453 movrnz $acc0,-1,$acc0 ! !in1infty 2454 stx $acc0,[%fp+STACK_BIAS-16] 2455 2456 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z); 2457 add %sp,LOCALS64+$Z2sqr,$rp 2458 2459 ldx [%sp+LOCALS64+$in1_z],$a0 2460 ldx [%sp+LOCALS64+$in1_z+8],$a1 2461 ldx [%sp+LOCALS64+$in1_z+16],$a2 2462 ldx [%sp+LOCALS64+$in1_z+24],$a3 2463 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); 2464 add %sp,LOCALS64+$Z1sqr,$rp 2465 2466 ldx [%sp+LOCALS64+$Z2sqr],$bi 2467 ldx [%sp+LOCALS64+$in2_z],$a0 2468 ldx [%sp+LOCALS64+$in2_z+8],$a1 2469 ldx [%sp+LOCALS64+$in2_z+16],$a2 2470 ldx [%sp+LOCALS64+$in2_z+24],$a3 2471 add %sp,LOCALS64+$Z2sqr,$bp 2472 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, Z2sqr, in2_z); 2473 add %sp,LOCALS64+$S1,$rp 2474 2475 ldx [%sp+LOCALS64+$Z1sqr],$bi 2476 ldx [%sp+LOCALS64+$in1_z],$a0 2477 ldx [%sp+LOCALS64+$in1_z+8],$a1 2478 ldx [%sp+LOCALS64+$in1_z+16],$a2 2479 ldx [%sp+LOCALS64+$in1_z+24],$a3 2480 add %sp,LOCALS64+$Z1sqr,$bp 2481 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); 2482 add %sp,LOCALS64+$S2,$rp 2483 2484 ldx [%sp+LOCALS64+$S1],$bi 2485 ldx [%sp+LOCALS64+$in1_y],$a0 2486 ldx [%sp+LOCALS64+$in1_y+8],$a1 2487 ldx [%sp+LOCALS64+$in1_y+16],$a2 2488 ldx [%sp+LOCALS64+$in1_y+24],$a3 2489 add %sp,LOCALS64+$S1,$bp 2490 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, S1, in1_y); 2491 add %sp,LOCALS64+$S1,$rp 2492 2493 ldx [%sp+LOCALS64+$S2],$bi 2494 ldx [%sp+LOCALS64+$in2_y],$a0 2495 ldx [%sp+LOCALS64+$in2_y+8],$a1 2496 ldx [%sp+LOCALS64+$in2_y+16],$a2 2497 ldx [%sp+LOCALS64+$in2_y+24],$a3 2498 add %sp,LOCALS64+$S2,$bp 2499 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); 2500 add %sp,LOCALS64+$S2,$rp 2501 2502 ldx [%sp+LOCALS64+$Z2sqr],$bi ! forward load 2503 ldx [%sp+LOCALS64+$in1_x],$a0 2504 ldx [%sp+LOCALS64+$in1_x+8],$a1 2505 ldx [%sp+LOCALS64+$in1_x+16],$a2 2506 ldx [%sp+LOCALS64+$in1_x+24],$a3 2507 2508 add %sp,LOCALS64+$S1,$bp 2509 call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, S1); 2510 add %sp,LOCALS64+$R,$rp 2511 2512 or $acc1,$acc0,$acc0 ! see if result is zero 2513 or $acc3,$acc2,$acc2 2514 or $acc2,$acc0,$acc0 2515 stx $acc0,[%fp+STACK_BIAS-24] 2516 2517 add %sp,LOCALS64+$Z2sqr,$bp 2518 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U1, in1_x, Z2sqr); 2519 add %sp,LOCALS64+$U1,$rp 2520 2521 ldx [%sp+LOCALS64+$Z1sqr],$bi 2522 ldx [%sp+LOCALS64+$in2_x],$a0 2523 ldx [%sp+LOCALS64+$in2_x+8],$a1 2524 ldx [%sp+LOCALS64+$in2_x+16],$a2 2525 ldx [%sp+LOCALS64+$in2_x+24],$a3 2526 add %sp,LOCALS64+$Z1sqr,$bp 2527 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in2_x, Z1sqr); 2528 add %sp,LOCALS64+$U2,$rp 2529 2530 ldx [%sp+LOCALS64+$R],$a0 ! forward load 2531 ldx [%sp+LOCALS64+$R+8],$a1 2532 ldx [%sp+LOCALS64+$R+16],$a2 2533 ldx [%sp+LOCALS64+$R+24],$a3 2534 2535 add %sp,LOCALS64+$U1,$bp 2536 call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, U1); 2537 add %sp,LOCALS64+$H,$rp 2538 2539 or $acc1,$acc0,$acc0 ! see if result is zero 2540 or $acc3,$acc2,$acc2 2541 orcc $acc2,$acc0,$acc0 2542 2543 bne,pt %xcc,.Ladd_proceed_vis3 ! is_equal(U1,U2)? 2544 nop 2545 2546 ldx [%fp+STACK_BIAS-8],$t0 2547 ldx [%fp+STACK_BIAS-16],$t1 2548 ldx [%fp+STACK_BIAS-24],$t2 2549 andcc $t0,$t1,%g0 2550 be,pt %xcc,.Ladd_proceed_vis3 ! (in1infty || in2infty)? 2551 nop 2552 andcc $t2,$t2,%g0 2553 be,a,pt %xcc,.Ldouble_shortcut_vis3 ! is_equal(S1,S2)? 2554 add %sp,32*(12-10)+32,%sp ! difference in frame sizes 2555 2556 st %g0,[$rp_real] 2557 st %g0,[$rp_real+4] 2558 st %g0,[$rp_real+8] 2559 st %g0,[$rp_real+12] 2560 st %g0,[$rp_real+16] 2561 st %g0,[$rp_real+20] 2562 st %g0,[$rp_real+24] 2563 st %g0,[$rp_real+28] 2564 st %g0,[$rp_real+32] 2565 st %g0,[$rp_real+32+4] 2566 st %g0,[$rp_real+32+8] 2567 st %g0,[$rp_real+32+12] 2568 st %g0,[$rp_real+32+16] 2569 st %g0,[$rp_real+32+20] 2570 st %g0,[$rp_real+32+24] 2571 st %g0,[$rp_real+32+28] 2572 st %g0,[$rp_real+64] 2573 st %g0,[$rp_real+64+4] 2574 st %g0,[$rp_real+64+8] 2575 st %g0,[$rp_real+64+12] 2576 st %g0,[$rp_real+64+16] 2577 st %g0,[$rp_real+64+20] 2578 st %g0,[$rp_real+64+24] 2579 st %g0,[$rp_real+64+28] 2580 b .Ladd_done_vis3 2581 nop 2582 2583.align 16 2584.Ladd_proceed_vis3: 2585 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); 2586 add %sp,LOCALS64+$Rsqr,$rp 2587 2588 ldx [%sp+LOCALS64+$H],$bi 2589 ldx [%sp+LOCALS64+$in1_z],$a0 2590 ldx [%sp+LOCALS64+$in1_z+8],$a1 2591 ldx [%sp+LOCALS64+$in1_z+16],$a2 2592 ldx [%sp+LOCALS64+$in1_z+24],$a3 2593 add %sp,LOCALS64+$H,$bp 2594 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); 2595 add %sp,LOCALS64+$res_z,$rp 2596 2597 ldx [%sp+LOCALS64+$H],$a0 2598 ldx [%sp+LOCALS64+$H+8],$a1 2599 ldx [%sp+LOCALS64+$H+16],$a2 2600 ldx [%sp+LOCALS64+$H+24],$a3 2601 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); 2602 add %sp,LOCALS64+$Hsqr,$rp 2603 2604 ldx [%sp+LOCALS64+$res_z],$bi 2605 ldx [%sp+LOCALS64+$in2_z],$a0 2606 ldx [%sp+LOCALS64+$in2_z+8],$a1 2607 ldx [%sp+LOCALS64+$in2_z+16],$a2 2608 ldx [%sp+LOCALS64+$in2_z+24],$a3 2609 add %sp,LOCALS64+$res_z,$bp 2610 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, res_z, in2_z); 2611 add %sp,LOCALS64+$res_z,$rp 2612 2613 ldx [%sp+LOCALS64+$H],$bi 2614 ldx [%sp+LOCALS64+$Hsqr],$a0 2615 ldx [%sp+LOCALS64+$Hsqr+8],$a1 2616 ldx [%sp+LOCALS64+$Hsqr+16],$a2 2617 ldx [%sp+LOCALS64+$Hsqr+24],$a3 2618 add %sp,LOCALS64+$H,$bp 2619 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); 2620 add %sp,LOCALS64+$Hcub,$rp 2621 2622 ldx [%sp+LOCALS64+$U1],$bi 2623 ldx [%sp+LOCALS64+$Hsqr],$a0 2624 ldx [%sp+LOCALS64+$Hsqr+8],$a1 2625 ldx [%sp+LOCALS64+$Hsqr+16],$a2 2626 ldx [%sp+LOCALS64+$Hsqr+24],$a3 2627 add %sp,LOCALS64+$U1,$bp 2628 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, U1, Hsqr); 2629 add %sp,LOCALS64+$U2,$rp 2630 2631 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); 2632 add %sp,LOCALS64+$Hsqr,$rp 2633 2634 add %sp,LOCALS64+$Rsqr,$bp 2635 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); 2636 add %sp,LOCALS64+$res_x,$rp 2637 2638 add %sp,LOCALS64+$Hcub,$bp 2639 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); 2640 add %sp,LOCALS64+$res_x,$rp 2641 2642 ldx [%sp+LOCALS64+$S1],$bi ! forward load 2643 ldx [%sp+LOCALS64+$Hcub],$a0 2644 ldx [%sp+LOCALS64+$Hcub+8],$a1 2645 ldx [%sp+LOCALS64+$Hcub+16],$a2 2646 ldx [%sp+LOCALS64+$Hcub+24],$a3 2647 2648 add %sp,LOCALS64+$U2,$bp 2649 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); 2650 add %sp,LOCALS64+$res_y,$rp 2651 2652 add %sp,LOCALS64+$S1,$bp 2653 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S1, Hcub); 2654 add %sp,LOCALS64+$S2,$rp 2655 2656 ldx [%sp+LOCALS64+$R],$bi 2657 ldx [%sp+LOCALS64+$res_y],$a0 2658 ldx [%sp+LOCALS64+$res_y+8],$a1 2659 ldx [%sp+LOCALS64+$res_y+16],$a2 2660 ldx [%sp+LOCALS64+$res_y+24],$a3 2661 add %sp,LOCALS64+$R,$bp 2662 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); 2663 add %sp,LOCALS64+$res_y,$rp 2664 2665 add %sp,LOCALS64+$S2,$bp 2666 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); 2667 add %sp,LOCALS64+$res_y,$rp 2668 2669 ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty 2670 ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty 2671___ 2672for($i=0;$i<96;$i+=16) { # conditional moves 2673$code.=<<___; 2674 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res 2675 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 2676 ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 2677 ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 2678 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 2679 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 2680 movrz $t1,$acc2,$acc0 2681 movrz $t1,$acc3,$acc1 2682 movrz $t2,$acc4,$acc0 2683 movrz $t2,$acc5,$acc1 2684 srlx $acc0,32,$acc2 2685 srlx $acc1,32,$acc3 2686 st $acc0,[$rp_real+$i] 2687 st $acc2,[$rp_real+$i+4] 2688 st $acc1,[$rp_real+$i+8] 2689 st $acc3,[$rp_real+$i+12] 2690___ 2691} 2692$code.=<<___; 2693.Ladd_done_vis3: 2694 ret 2695 restore 2696.type ecp_nistz256_point_add_vis3,#function 2697.size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3 2698___ 2699} 2700######################################################################## 2701# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, 2702# const P256_POINT_AFFINE *in2); 2703{ 2704my ($res_x,$res_y,$res_z, 2705 $in1_x,$in1_y,$in1_z, 2706 $in2_x,$in2_y, 2707 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); 2708my $Z1sqr = $S2; 2709# above map() describes stack layout with 15 temporary 2710# 256-bit vectors on top. Then we reserve some space for 2711# !in1infty and !in2infty. 2712 2713$code.=<<___; 2714.align 32 2715ecp_nistz256_point_add_affine_vis3: 2716 save %sp,-STACK64_FRAME-32*15-32,%sp 2717 2718 mov $rp,$rp_real 2719 mov -1,$minus1 2720 mov -2,$poly3 2721 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 2722 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE 2723 2724 ! convert input to uint64_t[4] 2725 ld [$bp],$a0 ! in2_x 2726 ld [$bp+4],$t0 2727 ld [$bp+8],$a1 2728 ld [$bp+12],$t1 2729 ld [$bp+16],$a2 2730 ld [$bp+20],$t2 2731 ld [$bp+24],$a3 2732 ld [$bp+28],$t3 2733 sllx $t0,32,$t0 2734 sllx $t1,32,$t1 2735 ld [$bp+32],$acc0 ! in2_y 2736 or $a0,$t0,$a0 2737 ld [$bp+32+4],$t0 2738 sllx $t2,32,$t2 2739 ld [$bp+32+8],$acc1 2740 or $a1,$t1,$a1 2741 ld [$bp+32+12],$t1 2742 sllx $t3,32,$t3 2743 ld [$bp+32+16],$acc2 2744 or $a2,$t2,$a2 2745 ld [$bp+32+20],$t2 2746 or $a3,$t3,$a3 2747 ld [$bp+32+24],$acc3 2748 sllx $t0,32,$t0 2749 ld [$bp+32+28],$t3 2750 sllx $t1,32,$t1 2751 stx $a0,[%sp+LOCALS64+$in2_x] 2752 sllx $t2,32,$t2 2753 stx $a1,[%sp+LOCALS64+$in2_x+8] 2754 sllx $t3,32,$t3 2755 stx $a2,[%sp+LOCALS64+$in2_x+16] 2756 or $acc0,$t0,$acc0 2757 stx $a3,[%sp+LOCALS64+$in2_x+24] 2758 or $acc1,$t1,$acc1 2759 stx $acc0,[%sp+LOCALS64+$in2_y] 2760 or $acc2,$t2,$acc2 2761 stx $acc1,[%sp+LOCALS64+$in2_y+8] 2762 or $acc3,$t3,$acc3 2763 stx $acc2,[%sp+LOCALS64+$in2_y+16] 2764 stx $acc3,[%sp+LOCALS64+$in2_y+24] 2765 2766 or $a1,$a0,$a0 2767 or $a3,$a2,$a2 2768 or $acc1,$acc0,$acc0 2769 or $acc3,$acc2,$acc2 2770 or $a2,$a0,$a0 2771 or $acc2,$acc0,$acc0 2772 or $acc0,$a0,$a0 2773 movrnz $a0,-1,$a0 ! !in2infty 2774 stx $a0,[%fp+STACK_BIAS-8] 2775 2776 ld [$ap],$a0 ! in1_x 2777 ld [$ap+4],$t0 2778 ld [$ap+8],$a1 2779 ld [$ap+12],$t1 2780 ld [$ap+16],$a2 2781 ld [$ap+20],$t2 2782 ld [$ap+24],$a3 2783 ld [$ap+28],$t3 2784 sllx $t0,32,$t0 2785 sllx $t1,32,$t1 2786 ld [$ap+32],$acc0 ! in1_y 2787 or $a0,$t0,$a0 2788 ld [$ap+32+4],$t0 2789 sllx $t2,32,$t2 2790 ld [$ap+32+8],$acc1 2791 or $a1,$t1,$a1 2792 ld [$ap+32+12],$t1 2793 sllx $t3,32,$t3 2794 ld [$ap+32+16],$acc2 2795 or $a2,$t2,$a2 2796 ld [$ap+32+20],$t2 2797 or $a3,$t3,$a3 2798 ld [$ap+32+24],$acc3 2799 sllx $t0,32,$t0 2800 ld [$ap+32+28],$t3 2801 sllx $t1,32,$t1 2802 stx $a0,[%sp+LOCALS64+$in1_x] 2803 sllx $t2,32,$t2 2804 stx $a1,[%sp+LOCALS64+$in1_x+8] 2805 sllx $t3,32,$t3 2806 stx $a2,[%sp+LOCALS64+$in1_x+16] 2807 or $acc0,$t0,$acc0 2808 stx $a3,[%sp+LOCALS64+$in1_x+24] 2809 or $acc1,$t1,$acc1 2810 stx $acc0,[%sp+LOCALS64+$in1_y] 2811 or $acc2,$t2,$acc2 2812 stx $acc1,[%sp+LOCALS64+$in1_y+8] 2813 or $acc3,$t3,$acc3 2814 stx $acc2,[%sp+LOCALS64+$in1_y+16] 2815 stx $acc3,[%sp+LOCALS64+$in1_y+24] 2816 2817 ld [$ap+64],$a0 ! in1_z 2818 ld [$ap+64+4],$t0 2819 ld [$ap+64+8],$a1 2820 ld [$ap+64+12],$t1 2821 ld [$ap+64+16],$a2 2822 ld [$ap+64+20],$t2 2823 ld [$ap+64+24],$a3 2824 ld [$ap+64+28],$t3 2825 sllx $t0,32,$t0 2826 sllx $t1,32,$t1 2827 or $a0,$t0,$a0 2828 sllx $t2,32,$t2 2829 or $a1,$t1,$a1 2830 sllx $t3,32,$t3 2831 stx $a0,[%sp+LOCALS64+$in1_z] 2832 or $a2,$t2,$a2 2833 stx $a1,[%sp+LOCALS64+$in1_z+8] 2834 or $a3,$t3,$a3 2835 stx $a2,[%sp+LOCALS64+$in1_z+16] 2836 stx $a3,[%sp+LOCALS64+$in1_z+24] 2837 2838 or $a1,$a0,$t0 2839 or $a3,$a2,$t2 2840 or $t2,$t0,$t0 2841 movrnz $t0,-1,$t0 ! !in1infty 2842 stx $t0,[%fp+STACK_BIAS-16] 2843 2844 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); 2845 add %sp,LOCALS64+$Z1sqr,$rp 2846 2847 ldx [%sp+LOCALS64+$in2_x],$bi 2848 mov $acc0,$a0 2849 mov $acc1,$a1 2850 mov $acc2,$a2 2851 mov $acc3,$a3 2852 add %sp,LOCALS64+$in2_x,$bp 2853 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, Z1sqr, in2_x); 2854 add %sp,LOCALS64+$U2,$rp 2855 2856 ldx [%sp+LOCALS64+$Z1sqr],$bi ! forward load 2857 ldx [%sp+LOCALS64+$in1_z],$a0 2858 ldx [%sp+LOCALS64+$in1_z+8],$a1 2859 ldx [%sp+LOCALS64+$in1_z+16],$a2 2860 ldx [%sp+LOCALS64+$in1_z+24],$a3 2861 2862 add %sp,LOCALS64+$in1_x,$bp 2863 call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, in1_x); 2864 add %sp,LOCALS64+$H,$rp 2865 2866 add %sp,LOCALS64+$Z1sqr,$bp 2867 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); 2868 add %sp,LOCALS64+$S2,$rp 2869 2870 ldx [%sp+LOCALS64+$H],$bi 2871 ldx [%sp+LOCALS64+$in1_z],$a0 2872 ldx [%sp+LOCALS64+$in1_z+8],$a1 2873 ldx [%sp+LOCALS64+$in1_z+16],$a2 2874 ldx [%sp+LOCALS64+$in1_z+24],$a3 2875 add %sp,LOCALS64+$H,$bp 2876 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); 2877 add %sp,LOCALS64+$res_z,$rp 2878 2879 ldx [%sp+LOCALS64+$S2],$bi 2880 ldx [%sp+LOCALS64+$in2_y],$a0 2881 ldx [%sp+LOCALS64+$in2_y+8],$a1 2882 ldx [%sp+LOCALS64+$in2_y+16],$a2 2883 ldx [%sp+LOCALS64+$in2_y+24],$a3 2884 add %sp,LOCALS64+$S2,$bp 2885 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); 2886 add %sp,LOCALS64+$S2,$rp 2887 2888 ldx [%sp+LOCALS64+$H],$a0 ! forward load 2889 ldx [%sp+LOCALS64+$H+8],$a1 2890 ldx [%sp+LOCALS64+$H+16],$a2 2891 ldx [%sp+LOCALS64+$H+24],$a3 2892 2893 add %sp,LOCALS64+$in1_y,$bp 2894 call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, in1_y); 2895 add %sp,LOCALS64+$R,$rp 2896 2897 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); 2898 add %sp,LOCALS64+$Hsqr,$rp 2899 2900 ldx [%sp+LOCALS64+$R],$a0 2901 ldx [%sp+LOCALS64+$R+8],$a1 2902 ldx [%sp+LOCALS64+$R+16],$a2 2903 ldx [%sp+LOCALS64+$R+24],$a3 2904 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); 2905 add %sp,LOCALS64+$Rsqr,$rp 2906 2907 ldx [%sp+LOCALS64+$H],$bi 2908 ldx [%sp+LOCALS64+$Hsqr],$a0 2909 ldx [%sp+LOCALS64+$Hsqr+8],$a1 2910 ldx [%sp+LOCALS64+$Hsqr+16],$a2 2911 ldx [%sp+LOCALS64+$Hsqr+24],$a3 2912 add %sp,LOCALS64+$H,$bp 2913 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); 2914 add %sp,LOCALS64+$Hcub,$rp 2915 2916 ldx [%sp+LOCALS64+$Hsqr],$bi 2917 ldx [%sp+LOCALS64+$in1_x],$a0 2918 ldx [%sp+LOCALS64+$in1_x+8],$a1 2919 ldx [%sp+LOCALS64+$in1_x+16],$a2 2920 ldx [%sp+LOCALS64+$in1_x+24],$a3 2921 add %sp,LOCALS64+$Hsqr,$bp 2922 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in1_x, Hsqr); 2923 add %sp,LOCALS64+$U2,$rp 2924 2925 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); 2926 add %sp,LOCALS64+$Hsqr,$rp 2927 2928 add %sp,LOCALS64+$Rsqr,$bp 2929 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); 2930 add %sp,LOCALS64+$res_x,$rp 2931 2932 add %sp,LOCALS64+$Hcub,$bp 2933 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); 2934 add %sp,LOCALS64+$res_x,$rp 2935 2936 ldx [%sp+LOCALS64+$Hcub],$bi ! forward load 2937 ldx [%sp+LOCALS64+$in1_y],$a0 2938 ldx [%sp+LOCALS64+$in1_y+8],$a1 2939 ldx [%sp+LOCALS64+$in1_y+16],$a2 2940 ldx [%sp+LOCALS64+$in1_y+24],$a3 2941 2942 add %sp,LOCALS64+$U2,$bp 2943 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); 2944 add %sp,LOCALS64+$res_y,$rp 2945 2946 add %sp,LOCALS64+$Hcub,$bp 2947 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, in1_y, Hcub); 2948 add %sp,LOCALS64+$S2,$rp 2949 2950 ldx [%sp+LOCALS64+$R],$bi 2951 ldx [%sp+LOCALS64+$res_y],$a0 2952 ldx [%sp+LOCALS64+$res_y+8],$a1 2953 ldx [%sp+LOCALS64+$res_y+16],$a2 2954 ldx [%sp+LOCALS64+$res_y+24],$a3 2955 add %sp,LOCALS64+$R,$bp 2956 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); 2957 add %sp,LOCALS64+$res_y,$rp 2958 2959 add %sp,LOCALS64+$S2,$bp 2960 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); 2961 add %sp,LOCALS64+$res_y,$rp 2962 2963 ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty 2964 ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty 29651: call .+8 2966 add %o7,.Lone_mont_vis3-1b,$bp 2967___ 2968for($i=0;$i<64;$i+=16) { # conditional moves 2969$code.=<<___; 2970 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res 2971 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 2972 ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 2973 ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 2974 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 2975 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 2976 movrz $t1,$acc2,$acc0 2977 movrz $t1,$acc3,$acc1 2978 movrz $t2,$acc4,$acc0 2979 movrz $t2,$acc5,$acc1 2980 srlx $acc0,32,$acc2 2981 srlx $acc1,32,$acc3 2982 st $acc0,[$rp_real+$i] 2983 st $acc2,[$rp_real+$i+4] 2984 st $acc1,[$rp_real+$i+8] 2985 st $acc3,[$rp_real+$i+12] 2986___ 2987} 2988for(;$i<96;$i+=16) { 2989$code.=<<___; 2990 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res 2991 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 2992 ldx [$bp+$i-64],$acc2 ! "in2" 2993 ldx [$bp+$i-64+8],$acc3 2994 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 2995 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 2996 movrz $t1,$acc2,$acc0 2997 movrz $t1,$acc3,$acc1 2998 movrz $t2,$acc4,$acc0 2999 movrz $t2,$acc5,$acc1 3000 srlx $acc0,32,$acc2 3001 srlx $acc1,32,$acc3 3002 st $acc0,[$rp_real+$i] 3003 st $acc2,[$rp_real+$i+4] 3004 st $acc1,[$rp_real+$i+8] 3005 st $acc3,[$rp_real+$i+12] 3006___ 3007} 3008$code.=<<___; 3009 ret 3010 restore 3011.type ecp_nistz256_point_add_affine_vis3,#function 3012.size ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3 3013.align 64 3014.Lone_mont_vis3: 3015.long 0x00000000,0x00000001, 0xffffffff,0x00000000 3016.long 0xffffffff,0xffffffff, 0x00000000,0xfffffffe 3017.align 64 3018___ 3019} }}} 3020 3021# Purpose of these subroutines is to explicitly encode VIS instructions, 3022# so that one can compile the module without having to specify VIS 3023# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. 3024# Idea is to reserve for option to produce "universal" binary and let 3025# programmer detect if current CPU is VIS capable at run-time. 3026sub unvis3 { 3027my ($mnemonic,$rs1,$rs2,$rd)=@_; 3028my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); 3029my ($ref,$opf); 3030my %visopf = ( "addxc" => 0x011, 3031 "addxccc" => 0x013, 3032 "umulxhi" => 0x016 ); 3033 3034 $ref = "$mnemonic\t$rs1,$rs2,$rd"; 3035 3036 if ($opf=$visopf{$mnemonic}) { 3037 foreach ($rs1,$rs2,$rd) { 3038 return $ref if (!/%([goli])([0-9])/); 3039 $_=$bias{$1}+$2; 3040 } 3041 3042 return sprintf ".word\t0x%08x !%s", 3043 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, 3044 $ref; 3045 } else { 3046 return $ref; 3047 } 3048} 3049 3050foreach (split("\n",$code)) { 3051 s/\`([^\`]*)\`/eval $1/ge; 3052 3053 s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ 3054 &unvis3($1,$2,$3,$4) 3055 /ge; 3056 3057 print $_,"\n"; 3058} 3059 3060close STDOUT or die "error closing STDOUT: $!"; 3061