1#! /usr/bin/env perl 2# Copyright 2015-2019 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# ECP_NISTZ256 module for ARMv8. 18# 19# February 2015. 20# 21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in 22# http://eprint.iacr.org/2013/816. 23# 24# with/without -DECP_NISTZ256_ASM 25# Apple A7 +190-360% 26# Cortex-A53 +190-400% 27# Cortex-A57 +190-350% 28# Denver +230-400% 29# 30# Ranges denote minimum and maximum improvement coefficients depending 31# on benchmark. Lower coefficients are for ECDSA sign, server-side 32# operation. Keep in mind that +400% means 5x improvement. 

# ----------------------------------------------------------------------
# Driver preamble.
#
# Usage: <script> <flavour> [...] <output-file>
#
# The first argument is the perlasm "flavour" that is handed to
# arm-xlate.pl.  Remaining arguments are consumed until one looks like a
# file name ("name.ext"); that one becomes $output.  Everything this
# script prints afterwards is piped through arm-xlate.pl.
#
# $flavour, $output, $dir, $xlate, $code and @arr are intentionally left
# as package variables: later parts of the script refer to them.
# ----------------------------------------------------------------------
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Directory part of this script's own path (with trailing separator), or
# the empty string when the script was invoked without one.  Taking $1
# only after a successful match avoids picking up a stale capture.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Route STDOUT through the translator.  A failure to spawn it must be
# fatal, otherwise the build would silently produce no assembly output.
open OUT,'|-',"\"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

{
# Symbolic names for the AArch64 general-purpose registers used by the
# generated code: x0..x17, then x19 and x20 (x18 is not allocated).
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table is looked up first in the current directory, then relative
# to this script's directory ($dir, derived from $0 above).
open(TABLE,'<',"ecp_nistz256_table.c") or
open(TABLE,'<',"${dir}../ecp_nistz256_table.c") or
die "failed to open ecp_nistz256_table.c:",$!;

# Integer arithmetic stays in effect for the remainder of the enclosing
# block; the table-emission code that follows depends on it.
use integer;

# Every TOBN(hi,lo) entry contributes two 32-bit words to @arr, low word
# first.  The file is read line by line rather than slurped.
while (my $line = <TABLE>) {
    while ($line =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
	push @arr,hex($2),hex($1);
    }
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl ecp_nistz256_precomputed
.type ecp_nistz256_precomputed,%object
.align 12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
#
# @arr holds 37 sub-tables of 64 entries, each entry being 16 32-bit
# words.  For every sub-table 64 output rows are produced; row $row
# carries byte number $row (word $row/4, byte $row%4 within that word,
# least significant first) of each of the 64 entries.  The division
# relies on integer arithmetic being in effect in the enclosing scope.
for (1..37) {
	my @tbl = splice(@arr,0,64*16);		# consume one sub-table
	for my $row (0..63) {
		my @line = map {
			($tbl[$_*16+$row/4]>>(($row%4)*8))&0xff
		} (0..63);
		$code.=".byte\t".join(',',map { sprintf "0x%02x",$_ } @line)."\n";
	}
}
$code.=<<___;
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align 5
.Lpoly:
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR: // 2^512 mod P precomputed for NIST P256 polynomial
.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad 1,0,0,0
.Lord:
.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad 0xccd1c8aaee00bc4f
.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_to_mont
.type ecp_nistz256_to_mont,%function
.align 6
ecp_nistz256_to_mont:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-32]!
124 add x29,sp,#0 125 stp x19,x20,[sp,#16] 126 127 ldr $bi,.LRR // bp[0] 128 ldp $a0,$a1,[$ap] 129 ldp $a2,$a3,[$ap,#16] 130 ldr $poly1,.Lpoly+8 131 ldr $poly3,.Lpoly+24 132 adr $bp,.LRR // &bp[0] 133 134 bl __ecp_nistz256_mul_mont 135 136 ldp x19,x20,[sp,#16] 137 ldp x29,x30,[sp],#32 138 .inst 0xd50323bf // autiasp 139 ret 140.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont 141 142// void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); 143.globl ecp_nistz256_from_mont 144.type ecp_nistz256_from_mont,%function 145.align 4 146ecp_nistz256_from_mont: 147 .inst 0xd503233f // paciasp 148 stp x29,x30,[sp,#-32]! 149 add x29,sp,#0 150 stp x19,x20,[sp,#16] 151 152 mov $bi,#1 // bp[0] 153 ldp $a0,$a1,[$ap] 154 ldp $a2,$a3,[$ap,#16] 155 ldr $poly1,.Lpoly+8 156 ldr $poly3,.Lpoly+24 157 adr $bp,.Lone // &bp[0] 158 159 bl __ecp_nistz256_mul_mont 160 161 ldp x19,x20,[sp,#16] 162 ldp x29,x30,[sp],#32 163 .inst 0xd50323bf // autiasp 164 ret 165.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont 166 167// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], 168// const BN_ULONG x2[4]); 169.globl ecp_nistz256_mul_mont 170.type ecp_nistz256_mul_mont,%function 171.align 4 172ecp_nistz256_mul_mont: 173 .inst 0xd503233f // paciasp 174 stp x29,x30,[sp,#-32]! 175 add x29,sp,#0 176 stp x19,x20,[sp,#16] 177 178 ldr $bi,[$bp] // bp[0] 179 ldp $a0,$a1,[$ap] 180 ldp $a2,$a3,[$ap,#16] 181 ldr $poly1,.Lpoly+8 182 ldr $poly3,.Lpoly+24 183 184 bl __ecp_nistz256_mul_mont 185 186 ldp x19,x20,[sp,#16] 187 ldp x29,x30,[sp],#32 188 .inst 0xd50323bf // autiasp 189 ret 190.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 191 192// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); 193.globl ecp_nistz256_sqr_mont 194.type ecp_nistz256_sqr_mont,%function 195.align 4 196ecp_nistz256_sqr_mont: 197 .inst 0xd503233f // paciasp 198 stp x29,x30,[sp,#-32]! 
199 add x29,sp,#0 200 stp x19,x20,[sp,#16] 201 202 ldp $a0,$a1,[$ap] 203 ldp $a2,$a3,[$ap,#16] 204 ldr $poly1,.Lpoly+8 205 ldr $poly3,.Lpoly+24 206 207 bl __ecp_nistz256_sqr_mont 208 209 ldp x19,x20,[sp,#16] 210 ldp x29,x30,[sp],#32 211 .inst 0xd50323bf // autiasp 212 ret 213.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 214 215// void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], 216// const BN_ULONG x2[4]); 217.globl ecp_nistz256_add 218.type ecp_nistz256_add,%function 219.align 4 220ecp_nistz256_add: 221 .inst 0xd503233f // paciasp 222 stp x29,x30,[sp,#-16]! 223 add x29,sp,#0 224 225 ldp $acc0,$acc1,[$ap] 226 ldp $t0,$t1,[$bp] 227 ldp $acc2,$acc3,[$ap,#16] 228 ldp $t2,$t3,[$bp,#16] 229 ldr $poly1,.Lpoly+8 230 ldr $poly3,.Lpoly+24 231 232 bl __ecp_nistz256_add 233 234 ldp x29,x30,[sp],#16 235 .inst 0xd50323bf // autiasp 236 ret 237.size ecp_nistz256_add,.-ecp_nistz256_add 238 239// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); 240.globl ecp_nistz256_div_by_2 241.type ecp_nistz256_div_by_2,%function 242.align 4 243ecp_nistz256_div_by_2: 244 .inst 0xd503233f // paciasp 245 stp x29,x30,[sp,#-16]! 246 add x29,sp,#0 247 248 ldp $acc0,$acc1,[$ap] 249 ldp $acc2,$acc3,[$ap,#16] 250 ldr $poly1,.Lpoly+8 251 ldr $poly3,.Lpoly+24 252 253 bl __ecp_nistz256_div_by_2 254 255 ldp x29,x30,[sp],#16 256 .inst 0xd50323bf // autiasp 257 ret 258.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 259 260// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); 261.globl ecp_nistz256_mul_by_2 262.type ecp_nistz256_mul_by_2,%function 263.align 4 264ecp_nistz256_mul_by_2: 265 .inst 0xd503233f // paciasp 266 stp x29,x30,[sp,#-16]! 
267 add x29,sp,#0 268 269 ldp $acc0,$acc1,[$ap] 270 ldp $acc2,$acc3,[$ap,#16] 271 ldr $poly1,.Lpoly+8 272 ldr $poly3,.Lpoly+24 273 mov $t0,$acc0 274 mov $t1,$acc1 275 mov $t2,$acc2 276 mov $t3,$acc3 277 278 bl __ecp_nistz256_add // ret = a+a // 2*a 279 280 ldp x29,x30,[sp],#16 281 .inst 0xd50323bf // autiasp 282 ret 283.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 284 285// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); 286.globl ecp_nistz256_mul_by_3 287.type ecp_nistz256_mul_by_3,%function 288.align 4 289ecp_nistz256_mul_by_3: 290 .inst 0xd503233f // paciasp 291 stp x29,x30,[sp,#-16]! 292 add x29,sp,#0 293 294 ldp $acc0,$acc1,[$ap] 295 ldp $acc2,$acc3,[$ap,#16] 296 ldr $poly1,.Lpoly+8 297 ldr $poly3,.Lpoly+24 298 mov $t0,$acc0 299 mov $t1,$acc1 300 mov $t2,$acc2 301 mov $t3,$acc3 302 mov $a0,$acc0 303 mov $a1,$acc1 304 mov $a2,$acc2 305 mov $a3,$acc3 306 307 bl __ecp_nistz256_add // ret = a+a // 2*a 308 309 mov $t0,$a0 310 mov $t1,$a1 311 mov $t2,$a2 312 mov $t3,$a3 313 314 bl __ecp_nistz256_add // ret += a // 2*a+a=3*a 315 316 ldp x29,x30,[sp],#16 317 .inst 0xd50323bf // autiasp 318 ret 319.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 320 321// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], 322// const BN_ULONG x2[4]); 323.globl ecp_nistz256_sub 324.type ecp_nistz256_sub,%function 325.align 4 326ecp_nistz256_sub: 327 .inst 0xd503233f // paciasp 328 stp x29,x30,[sp,#-16]! 329 add x29,sp,#0 330 331 ldp $acc0,$acc1,[$ap] 332 ldp $acc2,$acc3,[$ap,#16] 333 ldr $poly1,.Lpoly+8 334 ldr $poly3,.Lpoly+24 335 336 bl __ecp_nistz256_sub_from 337 338 ldp x29,x30,[sp],#16 339 .inst 0xd50323bf // autiasp 340 ret 341.size ecp_nistz256_sub,.-ecp_nistz256_sub 342 343// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); 344.globl ecp_nistz256_neg 345.type ecp_nistz256_neg,%function 346.align 4 347ecp_nistz256_neg: 348 .inst 0xd503233f // paciasp 349 stp x29,x30,[sp,#-16]! 
350 add x29,sp,#0 351 352 mov $bp,$ap 353 mov $acc0,xzr // a = 0 354 mov $acc1,xzr 355 mov $acc2,xzr 356 mov $acc3,xzr 357 ldr $poly1,.Lpoly+8 358 ldr $poly3,.Lpoly+24 359 360 bl __ecp_nistz256_sub_from 361 362 ldp x29,x30,[sp],#16 363 .inst 0xd50323bf // autiasp 364 ret 365.size ecp_nistz256_neg,.-ecp_nistz256_neg 366 367// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded 368// to $a0-$a3 and b[0] - to $bi 369.type __ecp_nistz256_mul_mont,%function 370.align 4 371__ecp_nistz256_mul_mont: 372 mul $acc0,$a0,$bi // a[0]*b[0] 373 umulh $t0,$a0,$bi 374 375 mul $acc1,$a1,$bi // a[1]*b[0] 376 umulh $t1,$a1,$bi 377 378 mul $acc2,$a2,$bi // a[2]*b[0] 379 umulh $t2,$a2,$bi 380 381 mul $acc3,$a3,$bi // a[3]*b[0] 382 umulh $t3,$a3,$bi 383 ldr $bi,[$bp,#8] // b[1] 384 385 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication 386 lsl $t0,$acc0,#32 387 adcs $acc2,$acc2,$t1 388 lsr $t1,$acc0,#32 389 adcs $acc3,$acc3,$t2 390 adc $acc4,xzr,$t3 391 mov $acc5,xzr 392___ 393for($i=1;$i<4;$i++) { 394 # Reduction iteration is normally performed by accumulating 395 # result of multiplication of modulus by "magic" digit [and 396 # omitting least significant word, which is guaranteed to 397 # be 0], but thanks to special form of modulus and "magic" 398 # digit being equal to least significant word, it can be 399 # performed with additions and subtractions alone. 
Indeed: 400 # 401 # ffff0001.00000000.0000ffff.ffffffff 402 # * abcdefgh 403 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh 404 # 405 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 406 # rewrite above as: 407 # 408 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh 409 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 410 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh 411 # 412 # or marking redundant operations: 413 # 414 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- 415 # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- 416 # - 0000abcd.efgh0000.--------.--------.-------- 417 418$code.=<<___; 419 subs $t2,$acc0,$t0 // "*0xffff0001" 420 sbc $t3,$acc0,$t1 421 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 422 mul $t0,$a0,$bi // lo(a[0]*b[i]) 423 adcs $acc1,$acc2,$t1 424 mul $t1,$a1,$bi // lo(a[1]*b[i]) 425 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 426 mul $t2,$a2,$bi // lo(a[2]*b[i]) 427 adcs $acc3,$acc4,$t3 428 mul $t3,$a3,$bi // lo(a[3]*b[i]) 429 adc $acc4,$acc5,xzr 430 431 adds $acc0,$acc0,$t0 // accumulate low parts of multiplication 432 umulh $t0,$a0,$bi // hi(a[0]*b[i]) 433 adcs $acc1,$acc1,$t1 434 umulh $t1,$a1,$bi // hi(a[1]*b[i]) 435 adcs $acc2,$acc2,$t2 436 umulh $t2,$a2,$bi // hi(a[2]*b[i]) 437 adcs $acc3,$acc3,$t3 438 umulh $t3,$a3,$bi // hi(a[3]*b[i]) 439 adc $acc4,$acc4,xzr 440___ 441$code.=<<___ if ($i<3); 442 ldr $bi,[$bp,#8*($i+1)] // b[$i+1] 443___ 444$code.=<<___; 445 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication 446 lsl $t0,$acc0,#32 447 adcs $acc2,$acc2,$t1 448 lsr $t1,$acc0,#32 449 adcs $acc3,$acc3,$t2 450 adcs $acc4,$acc4,$t3 451 adc $acc5,xzr,xzr 452___ 453} 454$code.=<<___; 455 // last reduction 456 subs $t2,$acc0,$t0 // "*0xffff0001" 457 sbc $t3,$acc0,$t1 458 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 459 adcs $acc1,$acc2,$t1 460 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 461 adcs $acc3,$acc4,$t3 462 adc $acc4,$acc5,xzr 463 464 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus 465 sbcs 
$t1,$acc1,$poly1 466 sbcs $t2,$acc2,xzr 467 sbcs $t3,$acc3,$poly3 468 sbcs xzr,$acc4,xzr // did it borrow? 469 470 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 471 csel $acc1,$acc1,$t1,lo 472 csel $acc2,$acc2,$t2,lo 473 stp $acc0,$acc1,[$rp] 474 csel $acc3,$acc3,$t3,lo 475 stp $acc2,$acc3,[$rp,#16] 476 477 ret 478.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 479 480// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded 481// to $a0-$a3 482.type __ecp_nistz256_sqr_mont,%function 483.align 4 484__ecp_nistz256_sqr_mont: 485 // | | | | | |a1*a0| | 486 // | | | | |a2*a0| | | 487 // | |a3*a2|a3*a0| | | | 488 // | | | |a2*a1| | | | 489 // | | |a3*a1| | | | | 490 // *| | | | | | | | 2| 491 // +|a3*a3|a2*a2|a1*a1|a0*a0| 492 // |--+--+--+--+--+--+--+--| 493 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx 494 // 495 // "can't overflow" below mark carrying into high part of 496 // multiplication result, which can't overflow, because it 497 // can never be all ones. 
498 499 mul $acc1,$a1,$a0 // a[1]*a[0] 500 umulh $t1,$a1,$a0 501 mul $acc2,$a2,$a0 // a[2]*a[0] 502 umulh $t2,$a2,$a0 503 mul $acc3,$a3,$a0 // a[3]*a[0] 504 umulh $acc4,$a3,$a0 505 506 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication 507 mul $t0,$a2,$a1 // a[2]*a[1] 508 umulh $t1,$a2,$a1 509 adcs $acc3,$acc3,$t2 510 mul $t2,$a3,$a1 // a[3]*a[1] 511 umulh $t3,$a3,$a1 512 adc $acc4,$acc4,xzr // can't overflow 513 514 mul $acc5,$a3,$a2 // a[3]*a[2] 515 umulh $acc6,$a3,$a2 516 517 adds $t1,$t1,$t2 // accumulate high parts of multiplication 518 mul $acc0,$a0,$a0 // a[0]*a[0] 519 adc $t2,$t3,xzr // can't overflow 520 521 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication 522 umulh $a0,$a0,$a0 523 adcs $acc4,$acc4,$t1 524 mul $t1,$a1,$a1 // a[1]*a[1] 525 adcs $acc5,$acc5,$t2 526 umulh $a1,$a1,$a1 527 adc $acc6,$acc6,xzr // can't overflow 528 529 adds $acc1,$acc1,$acc1 // acc[1-6]*=2 530 mul $t2,$a2,$a2 // a[2]*a[2] 531 adcs $acc2,$acc2,$acc2 532 umulh $a2,$a2,$a2 533 adcs $acc3,$acc3,$acc3 534 mul $t3,$a3,$a3 // a[3]*a[3] 535 adcs $acc4,$acc4,$acc4 536 umulh $a3,$a3,$a3 537 adcs $acc5,$acc5,$acc5 538 adcs $acc6,$acc6,$acc6 539 adc $acc7,xzr,xzr 540 541 adds $acc1,$acc1,$a0 // +a[i]*a[i] 542 adcs $acc2,$acc2,$t1 543 adcs $acc3,$acc3,$a1 544 adcs $acc4,$acc4,$t2 545 adcs $acc5,$acc5,$a2 546 lsl $t0,$acc0,#32 547 adcs $acc6,$acc6,$t3 548 lsr $t1,$acc0,#32 549 adc $acc7,$acc7,$a3 550___ 551for($i=0;$i<3;$i++) { # reductions, see commentary in 552 # multiplication for details 553$code.=<<___; 554 subs $t2,$acc0,$t0 // "*0xffff0001" 555 sbc $t3,$acc0,$t1 556 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 557 adcs $acc1,$acc2,$t1 558 lsl $t0,$acc0,#32 559 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 560 lsr $t1,$acc0,#32 561 adc $acc3,$t3,xzr // can't overflow 562___ 563} 564$code.=<<___; 565 subs $t2,$acc0,$t0 // "*0xffff0001" 566 sbc $t3,$acc0,$t1 567 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 568 adcs $acc1,$acc2,$t1 569 adcs 
$acc2,$acc3,$t2 // +=acc[0]*0xffff0001 570 adc $acc3,$t3,xzr // can't overflow 571 572 adds $acc0,$acc0,$acc4 // accumulate upper half 573 adcs $acc1,$acc1,$acc5 574 adcs $acc2,$acc2,$acc6 575 adcs $acc3,$acc3,$acc7 576 adc $acc4,xzr,xzr 577 578 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus 579 sbcs $t1,$acc1,$poly1 580 sbcs $t2,$acc2,xzr 581 sbcs $t3,$acc3,$poly3 582 sbcs xzr,$acc4,xzr // did it borrow? 583 584 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 585 csel $acc1,$acc1,$t1,lo 586 csel $acc2,$acc2,$t2,lo 587 stp $acc0,$acc1,[$rp] 588 csel $acc3,$acc3,$t3,lo 589 stp $acc2,$acc3,[$rp,#16] 590 591 ret 592.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont 593 594// Note that __ecp_nistz256_add expects both input vectors pre-loaded to 595// $a0-$a3 and $t0-$t3. This is done because it's used in multiple 596// contexts, e.g. in multiplication by 2 and 3... 597.type __ecp_nistz256_add,%function 598.align 4 599__ecp_nistz256_add: 600 adds $acc0,$acc0,$t0 // ret = a+b 601 adcs $acc1,$acc1,$t1 602 adcs $acc2,$acc2,$t2 603 adcs $acc3,$acc3,$t3 604 adc $ap,xzr,xzr // zap $ap 605 606 adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus 607 sbcs $t1,$acc1,$poly1 608 sbcs $t2,$acc2,xzr 609 sbcs $t3,$acc3,$poly3 610 sbcs xzr,$ap,xzr // did subtraction borrow? 611 612 csel $acc0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus 613 csel $acc1,$acc1,$t1,lo 614 csel $acc2,$acc2,$t2,lo 615 stp $acc0,$acc1,[$rp] 616 csel $acc3,$acc3,$t3,lo 617 stp $acc2,$acc3,[$rp,#16] 618 619 ret 620.size __ecp_nistz256_add,.-__ecp_nistz256_add 621 622.type __ecp_nistz256_sub_from,%function 623.align 4 624__ecp_nistz256_sub_from: 625 ldp $t0,$t1,[$bp] 626 ldp $t2,$t3,[$bp,#16] 627 subs $acc0,$acc0,$t0 // ret = a-b 628 sbcs $acc1,$acc1,$t1 629 sbcs $acc2,$acc2,$t2 630 sbcs $acc3,$acc3,$t3 631 sbc $ap,xzr,xzr // zap $ap 632 633 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus 634 adcs $t1,$acc1,$poly1 635 adcs $t2,$acc2,xzr 636 adc $t3,$acc3,$poly3 637 cmp $ap,xzr // did subtraction borrow? 638 639 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret 640 csel $acc1,$acc1,$t1,eq 641 csel $acc2,$acc2,$t2,eq 642 stp $acc0,$acc1,[$rp] 643 csel $acc3,$acc3,$t3,eq 644 stp $acc2,$acc3,[$rp,#16] 645 646 ret 647.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 648 649.type __ecp_nistz256_sub_morf,%function 650.align 4 651__ecp_nistz256_sub_morf: 652 ldp $t0,$t1,[$bp] 653 ldp $t2,$t3,[$bp,#16] 654 subs $acc0,$t0,$acc0 // ret = b-a 655 sbcs $acc1,$t1,$acc1 656 sbcs $acc2,$t2,$acc2 657 sbcs $acc3,$t3,$acc3 658 sbc $ap,xzr,xzr // zap $ap 659 660 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus 661 adcs $t1,$acc1,$poly1 662 adcs $t2,$acc2,xzr 663 adc $t3,$acc3,$poly3 664 cmp $ap,xzr // did subtraction borrow? 665 666 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret 667 csel $acc1,$acc1,$t1,eq 668 csel $acc2,$acc2,$t2,eq 669 stp $acc0,$acc1,[$rp] 670 csel $acc3,$acc3,$t3,eq 671 stp $acc2,$acc3,[$rp,#16] 672 673 ret 674.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 675 676.type __ecp_nistz256_div_by_2,%function 677.align 4 678__ecp_nistz256_div_by_2: 679 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus 680 adcs $t1,$acc1,$poly1 681 adcs $t2,$acc2,xzr 682 adcs $t3,$acc3,$poly3 683 adc $ap,xzr,xzr // zap $ap 684 tst $acc0,#1 // is a even? 
685 686 csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus 687 csel $acc1,$acc1,$t1,eq 688 csel $acc2,$acc2,$t2,eq 689 csel $acc3,$acc3,$t3,eq 690 csel $ap,xzr,$ap,eq 691 692 lsr $acc0,$acc0,#1 // ret >>= 1 693 orr $acc0,$acc0,$acc1,lsl#63 694 lsr $acc1,$acc1,#1 695 orr $acc1,$acc1,$acc2,lsl#63 696 lsr $acc2,$acc2,#1 697 orr $acc2,$acc2,$acc3,lsl#63 698 lsr $acc3,$acc3,#1 699 stp $acc0,$acc1,[$rp] 700 orr $acc3,$acc3,$ap,lsl#63 701 stp $acc2,$acc3,[$rp,#16] 702 703 ret 704.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 705___ 706######################################################################## 707# following subroutines are "literal" implementation of those found in 708# ecp_nistz256.c 709# 710######################################################################## 711# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 712# 713{ 714my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); 715# above map() describes stack layout with 4 temporary 716# 256-bit vectors on top. 717my ($rp_real,$ap_real) = map("x$_",(21,22)); 718 719$code.=<<___; 720.globl ecp_nistz256_point_double 721.type ecp_nistz256_point_double,%function 722.align 5 723ecp_nistz256_point_double: 724 .inst 0xd503233f // paciasp 725 stp x29,x30,[sp,#-80]! 
726 add x29,sp,#0 727 stp x19,x20,[sp,#16] 728 stp x21,x22,[sp,#32] 729 sub sp,sp,#32*4 730 731.Ldouble_shortcut: 732 ldp $acc0,$acc1,[$ap,#32] 733 mov $rp_real,$rp 734 ldp $acc2,$acc3,[$ap,#48] 735 mov $ap_real,$ap 736 ldr $poly1,.Lpoly+8 737 mov $t0,$acc0 738 ldr $poly3,.Lpoly+24 739 mov $t1,$acc1 740 ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont 741 mov $t2,$acc2 742 mov $t3,$acc3 743 ldp $a2,$a3,[$ap_real,#64+16] 744 add $rp,sp,#$S 745 bl __ecp_nistz256_add // p256_mul_by_2(S, in_y); 746 747 add $rp,sp,#$Zsqr 748 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); 749 750 ldp $t0,$t1,[$ap_real] 751 ldp $t2,$t3,[$ap_real,#16] 752 mov $a0,$acc0 // put Zsqr aside for p256_sub 753 mov $a1,$acc1 754 mov $a2,$acc2 755 mov $a3,$acc3 756 add $rp,sp,#$M 757 bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x); 758 759 add $bp,$ap_real,#0 760 mov $acc0,$a0 // restore Zsqr 761 mov $acc1,$a1 762 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont 763 mov $acc2,$a2 764 mov $acc3,$a3 765 ldp $a2,$a3,[sp,#$S+16] 766 add $rp,sp,#$Zsqr 767 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); 768 769 add $rp,sp,#$S 770 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); 771 772 ldr $bi,[$ap_real,#32] 773 ldp $a0,$a1,[$ap_real,#64] 774 ldp $a2,$a3,[$ap_real,#64+16] 775 add $bp,$ap_real,#32 776 add $rp,sp,#$tmp0 777 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); 778 779 mov $t0,$acc0 780 mov $t1,$acc1 781 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont 782 mov $t2,$acc2 783 mov $t3,$acc3 784 ldp $a2,$a3,[sp,#$S+16] 785 add $rp,$rp_real,#64 786 bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0); 787 788 add $rp,sp,#$tmp0 789 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); 790 791 ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont 792 ldp $a0,$a1,[sp,#$M] 793 ldp $a2,$a3,[sp,#$M+16] 794 add $rp,$rp_real,#32 795 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); 796 797 add $bp,sp,#$Zsqr 798 add $rp,sp,#$M 799 bl 
__ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); 800 801 mov $t0,$acc0 // duplicate M 802 mov $t1,$acc1 803 mov $t2,$acc2 804 mov $t3,$acc3 805 mov $a0,$acc0 // put M aside 806 mov $a1,$acc1 807 mov $a2,$acc2 808 mov $a3,$acc3 809 add $rp,sp,#$M 810 bl __ecp_nistz256_add 811 mov $t0,$a0 // restore M 812 mov $t1,$a1 813 ldr $bi,[$ap_real] // forward load for p256_mul_mont 814 mov $t2,$a2 815 ldp $a0,$a1,[sp,#$S] 816 mov $t3,$a3 817 ldp $a2,$a3,[sp,#$S+16] 818 bl __ecp_nistz256_add // p256_mul_by_3(M, M); 819 820 add $bp,$ap_real,#0 821 add $rp,sp,#$S 822 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); 823 824 mov $t0,$acc0 825 mov $t1,$acc1 826 ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont 827 mov $t2,$acc2 828 mov $t3,$acc3 829 ldp $a2,$a3,[sp,#$M+16] 830 add $rp,sp,#$tmp0 831 bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S); 832 833 add $rp,$rp_real,#0 834 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); 835 836 add $bp,sp,#$tmp0 837 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); 838 839 add $bp,sp,#$S 840 add $rp,sp,#$S 841 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); 842 843 ldr $bi,[sp,#$M] 844 mov $a0,$acc0 // copy S 845 mov $a1,$acc1 846 mov $a2,$acc2 847 mov $a3,$acc3 848 add $bp,sp,#$M 849 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); 850 851 add $bp,$rp_real,#32 852 add $rp,$rp_real,#32 853 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); 854 855 add sp,x29,#0 // destroy frame 856 ldp x19,x20,[x29,#16] 857 ldp x21,x22,[x29,#32] 858 ldp x29,x30,[sp],#80 859 .inst 0xd50323bf // autiasp 860 ret 861.size ecp_nistz256_point_double,.-ecp_nistz256_point_double 862___ 863} 864 865######################################################################## 866# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 867# const P256_POINT *in2); 868{ 869my ($res_x,$res_y,$res_z, 870 $H,$Hsqr,$R,$Rsqr,$Hcub, 871 $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); 872my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 873# 
above map() describes stack layout with 12 temporary 874# 256-bit vectors on top. 875my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); 876 877$code.=<<___; 878.globl ecp_nistz256_point_add 879.type ecp_nistz256_point_add,%function 880.align 5 881ecp_nistz256_point_add: 882 .inst 0xd503233f // paciasp 883 stp x29,x30,[sp,#-80]! 884 add x29,sp,#0 885 stp x19,x20,[sp,#16] 886 stp x21,x22,[sp,#32] 887 stp x23,x24,[sp,#48] 888 stp x25,x26,[sp,#64] 889 sub sp,sp,#32*12 890 891 ldp $a0,$a1,[$bp,#64] // in2_z 892 ldp $a2,$a3,[$bp,#64+16] 893 mov $rp_real,$rp 894 mov $ap_real,$ap 895 mov $bp_real,$bp 896 ldr $poly1,.Lpoly+8 897 ldr $poly3,.Lpoly+24 898 orr $t0,$a0,$a1 899 orr $t2,$a2,$a3 900 orr $in2infty,$t0,$t2 901 cmp $in2infty,#0 902 csetm $in2infty,ne // !in2infty 903 add $rp,sp,#$Z2sqr 904 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); 905 906 ldp $a0,$a1,[$ap_real,#64] // in1_z 907 ldp $a2,$a3,[$ap_real,#64+16] 908 orr $t0,$a0,$a1 909 orr $t2,$a2,$a3 910 orr $in1infty,$t0,$t2 911 cmp $in1infty,#0 912 csetm $in1infty,ne // !in1infty 913 add $rp,sp,#$Z1sqr 914 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); 915 916 ldr $bi,[$bp_real,#64] 917 ldp $a0,$a1,[sp,#$Z2sqr] 918 ldp $a2,$a3,[sp,#$Z2sqr+16] 919 add $bp,$bp_real,#64 920 add $rp,sp,#$S1 921 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); 922 923 ldr $bi,[$ap_real,#64] 924 ldp $a0,$a1,[sp,#$Z1sqr] 925 ldp $a2,$a3,[sp,#$Z1sqr+16] 926 add $bp,$ap_real,#64 927 add $rp,sp,#$S2 928 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); 929 930 ldr $bi,[$ap_real,#32] 931 ldp $a0,$a1,[sp,#$S1] 932 ldp $a2,$a3,[sp,#$S1+16] 933 add $bp,$ap_real,#32 934 add $rp,sp,#$S1 935 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); 936 937 ldr $bi,[$bp_real,#32] 938 ldp $a0,$a1,[sp,#$S2] 939 ldp $a2,$a3,[sp,#$S2+16] 940 add $bp,$bp_real,#32 941 add $rp,sp,#$S2 942 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); 943 944 add $bp,sp,#$S1 945 
ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont 946 ldp $a0,$a1,[$ap_real] 947 ldp $a2,$a3,[$ap_real,#16] 948 add $rp,sp,#$R 949 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); 950 951 orr $acc0,$acc0,$acc1 // see if result is zero 952 orr $acc2,$acc2,$acc3 953 orr $temp,$acc0,$acc2 954 955 add $bp,sp,#$Z2sqr 956 add $rp,sp,#$U1 957 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); 958 959 ldr $bi,[sp,#$Z1sqr] 960 ldp $a0,$a1,[$bp_real] 961 ldp $a2,$a3,[$bp_real,#16] 962 add $bp,sp,#$Z1sqr 963 add $rp,sp,#$U2 964 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); 965 966 add $bp,sp,#$U1 967 ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont 968 ldp $a2,$a3,[sp,#$R+16] 969 add $rp,sp,#$H 970 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); 971 972 orr $acc0,$acc0,$acc1 // see if result is zero 973 orr $acc2,$acc2,$acc3 974 orr $acc0,$acc0,$acc2 975 tst $acc0,$acc0 976 b.ne .Ladd_proceed // is_equal(U1,U2)? 977 978 tst $in1infty,$in2infty 979 b.eq .Ladd_proceed // (in1infty || in2infty)? 980 981 tst $temp,$temp 982 b.eq .Ladd_double // is_equal(S1,S2)? 
983 984 eor $a0,$a0,$a0 985 eor $a1,$a1,$a1 986 stp $a0,$a1,[$rp_real] 987 stp $a0,$a1,[$rp_real,#16] 988 stp $a0,$a1,[$rp_real,#32] 989 stp $a0,$a1,[$rp_real,#48] 990 stp $a0,$a1,[$rp_real,#64] 991 stp $a0,$a1,[$rp_real,#80] 992 b .Ladd_done 993 994.align 4 995.Ladd_double: 996 mov $ap,$ap_real 997 mov $rp,$rp_real 998 ldp x23,x24,[x29,#48] 999 ldp x25,x26,[x29,#64] 1000 add sp,sp,#32*(12-4) // difference in stack frames 1001 b .Ldouble_shortcut 1002 1003.align 4 1004.Ladd_proceed: 1005 add $rp,sp,#$Rsqr 1006 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); 1007 1008 ldr $bi,[$ap_real,#64] 1009 ldp $a0,$a1,[sp,#$H] 1010 ldp $a2,$a3,[sp,#$H+16] 1011 add $bp,$ap_real,#64 1012 add $rp,sp,#$res_z 1013 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); 1014 1015 ldp $a0,$a1,[sp,#$H] 1016 ldp $a2,$a3,[sp,#$H+16] 1017 add $rp,sp,#$Hsqr 1018 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); 1019 1020 ldr $bi,[$bp_real,#64] 1021 ldp $a0,$a1,[sp,#$res_z] 1022 ldp $a2,$a3,[sp,#$res_z+16] 1023 add $bp,$bp_real,#64 1024 add $rp,sp,#$res_z 1025 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); 1026 1027 ldr $bi,[sp,#$H] 1028 ldp $a0,$a1,[sp,#$Hsqr] 1029 ldp $a2,$a3,[sp,#$Hsqr+16] 1030 add $bp,sp,#$H 1031 add $rp,sp,#$Hcub 1032 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); 1033 1034 ldr $bi,[sp,#$Hsqr] 1035 ldp $a0,$a1,[sp,#$U1] 1036 ldp $a2,$a3,[sp,#$U1+16] 1037 add $bp,sp,#$Hsqr 1038 add $rp,sp,#$U2 1039 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); 1040 1041 mov $t0,$acc0 1042 mov $t1,$acc1 1043 mov $t2,$acc2 1044 mov $t3,$acc3 1045 add $rp,sp,#$Hsqr 1046 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); 1047 1048 add $bp,sp,#$Rsqr 1049 add $rp,sp,#$res_x 1050 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); 1051 1052 add $bp,sp,#$Hcub 1053 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); 1054 1055 add $bp,sp,#$U2 1056 ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont 1057 ldp 
$a0,$a1,[sp,#$S1] 1058 ldp $a2,$a3,[sp,#$S1+16] 1059 add $rp,sp,#$res_y 1060 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); 1061 1062 add $bp,sp,#$Hcub 1063 add $rp,sp,#$S2 1064 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); 1065 1066 ldr $bi,[sp,#$R] 1067 ldp $a0,$a1,[sp,#$res_y] 1068 ldp $a2,$a3,[sp,#$res_y+16] 1069 add $bp,sp,#$R 1070 add $rp,sp,#$res_y 1071 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); 1072 1073 add $bp,sp,#$S2 1074 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); 1075 1076 ldp $a0,$a1,[sp,#$res_x] // res 1077 ldp $a2,$a3,[sp,#$res_x+16] 1078 ldp $t0,$t1,[$bp_real] // in2 1079 ldp $t2,$t3,[$bp_real,#16] 1080___ 1081for($i=0;$i<64;$i+=32) { # conditional moves 1082$code.=<<___; 1083 ldp $acc0,$acc1,[$ap_real,#$i] // in1 1084 cmp $in1infty,#0 // !$in1intfy, remember? 1085 ldp $acc2,$acc3,[$ap_real,#$i+16] 1086 csel $t0,$a0,$t0,ne 1087 csel $t1,$a1,$t1,ne 1088 ldp $a0,$a1,[sp,#$res_x+$i+32] // res 1089 csel $t2,$a2,$t2,ne 1090 csel $t3,$a3,$t3,ne 1091 cmp $in2infty,#0 // !$in2intfy, remember? 1092 ldp $a2,$a3,[sp,#$res_x+$i+48] 1093 csel $acc0,$t0,$acc0,ne 1094 csel $acc1,$t1,$acc1,ne 1095 ldp $t0,$t1,[$bp_real,#$i+32] // in2 1096 csel $acc2,$t2,$acc2,ne 1097 csel $acc3,$t3,$acc3,ne 1098 ldp $t2,$t3,[$bp_real,#$i+48] 1099 stp $acc0,$acc1,[$rp_real,#$i] 1100 stp $acc2,$acc3,[$rp_real,#$i+16] 1101___ 1102} 1103$code.=<<___; 1104 ldp $acc0,$acc1,[$ap_real,#$i] // in1 1105 cmp $in1infty,#0 // !$in1intfy, remember? 1106 ldp $acc2,$acc3,[$ap_real,#$i+16] 1107 csel $t0,$a0,$t0,ne 1108 csel $t1,$a1,$t1,ne 1109 csel $t2,$a2,$t2,ne 1110 csel $t3,$a3,$t3,ne 1111 cmp $in2infty,#0 // !$in2intfy, remember? 
csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void	ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				      const P256_POINT_AFFINE *in2);
#
# Generates the routine that adds the affine point in2 (x,y only) to the
# point in1 (x,y,z) and stores x,y,z at out.  The order of the field
# operations is given by the "// p256_*()" remark on each bl below.
#
# The generated code keeps a frame of 80 bytes (x29,x30,x19-x26) plus
# 32*10 bytes of temporaries addressed off sp.  The three arguments are
# copied to x21-x23 because $rp and $bp are re-pointed before every
# helper call and, per the aliases at the top of this file, $ap/$bp
# double as $acc6/$acc7 in __ecp_nistz256_sqr_mont.
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
# $Z1sqr is an alias for the $S2 slot: the Z1sqr value is read back into
# $a0-$a3 just before p256_mul_mont writes S2 to the same 32 bytes.
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
# $in1infty/$in2infty end up all-ones when the respective input is NOT
# the point at infinity (see the "!in1infty"/"!in2infty" remarks): in1 is
# tested through its z limbs, in2 through the OR of all x and y limbs.
# $temp (x26) is not referenced by the code of this routine.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
# Final selection, emitted with csel instead of branches.  Each pass
# handles the 32 bytes at offset $i and preloads res/in2 for the next:
#	t   = (in1 is not infinity) ? res : in2
#	out = (in2 is not infinity) ? t   : in1
# Note that the C-style loop exits with $i == 64; the heredoc after the
# loop depends on that value for the third 32-byte group.
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !$in1intfy, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !$in2intfy, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
# Emitted once, after the first pass: re-point $bp_real 64 bytes below
# .Lone_mont so that the "in2" loads of the second pass (offsets 64 and
# 80) read .Lone_mont rather than memory behind in2's x and y.
# .Lone_mont is defined outside this section - presumably the value 1 in
# Montgomery form, i.e. the implicit z of an affine point; confirm there.
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
# Third 32-byte group ($i is 64 here); nothing is left to preload.
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !$in1intfy, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !$in2intfy, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
# Montgomery multiplication and squaring with the modulus read from the
# .Lord table (going by the names, the group order - .Lord itself is
# defined outside this section).  Register aliases for both routines:
# $ord0/$ord1 reuse the registers of $poly1/$poly3, $ord2/$ord3/$ordk/$t4
# take x21-x24, and $acc7 is re-declared as $bi, shadowing the outer
# $acc7 (= $bp) from the top of the file.
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	stp	x29,x30,[sp,#-64]!
add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
# Three passes, one per remaining word b[1]..b[3].  Each pass emits the
# reduction step for the accumulator left by the previous multiplication
# (multiplier $t4 = $acc0*$ordk), interleaved with the multiplication by
# the next word b[i], and computes the next $t4.
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
	#
	# "subs xzr,$acc0,#1" below discards its result and only sets
	# the carry flag (set unless $acc0 is zero).  It stands in for
	# adding the low word of $t4*$ord0, which is never computed;
	# presumably $ordk is chosen so that this sum's low word is
	# zero - confirm against the .Lord table.
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
# Reduction step for the accumulator left by the last pass, then one
# conditional subtraction of the modulus and the store of the result.
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                int rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below mark carrying into high part of
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
# Four reduction steps over the low half $acc0-$acc3.  Steps 0-2 also
# compute the multiplier for the following step into $t3; the Perl-level
# swap at the bottom of the loop then renames that register to $t4.
# After four swaps both names refer to their original registers again,
# which the code emitted after the loop relies on when it uses $t3.
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
# Add the upper half ($acc4-$acc7), conditionally subtract the modulus
# into $a0-$a3 and repeat while the counter in $bp is non-zero.
# NOTE(review): the counter is decremented at the top of the loop and
# tested at the bottom, so callers are expected to pass rep >= 1.
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
#
# Table layouts implied by the addressing in the four routines:
#  - w5: every 64-bit limb is split into two 32-bit halves and each half
#    is stored with a 64-byte stride (low halves of four limbs first,
#    then the high halves), 8 rows per coordinate.  The column is
#    4*index-4, i.e. the index is 1-based.  gather_w5 maps index 0 to
#    column 0 and then zeroes the result with csel, using the flags of
#    its initial "cmp" (no later instruction in it sets flags).
#  - w7: every byte is stored with a 64-byte stride.  scatter_w7 adds
#    the index as is, while gather_w7 subtracts 1 from a non-zero index
#    and masks the result with x3 (all-ones unless index is 0); it
#    cannot use csel because its loop counter update changes the flags.
# $mask is declared for x3, but the code below names x3 literally.
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//						     int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

# Post-process the accumulated assembly line by line: every `...` span
# is replaced by the result of eval-ing its contents as Perl, and the
# line is written to STDOUT, which the top of this file aliases to the
# pipe feeding arm-xlate.pl.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
# Closing the pipe flushes the output and waits for the translator.
# close() returns false on a write error or when the child exits with a
# non-zero status (in the latter case $! is 0 and $? holds the status),
# so fail loudly instead of leaving a truncated output file behind.
close STDOUT	# enforce flush
    or die "error closing STDOUT: " . ($! ? $! : "translator exit status $?");