#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, a server-side
# operation. Keep in mind that +400% means a 5x improvement.

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to the layout expected by
# ecp_nistz256_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;
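# (Informal aside on the substitution above: each TOBN(hi,lo) entry in
# ecp_nistz256_table.c packs one 64-bit limb from two 32-bit halves, so
# a hypothetical TOBN(0x12345678,0x9abcdef0) lands on @arr as the pair
# 0x9abcdef0,0x12345678, i.e. as 32-bit words in little-endian order;
# 16 such words, 64 bytes, make up one P256_POINT_AFFINE entry.)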

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr,
# not the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE into individual bytes at
# 64-byte intervals, similar to
#	1111222233334444
#	1234123412341234
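# To make the picture concrete: after this loop, byte k of table entry
# j sits at offset 64*k+j of its 64x64-byte block, so a gather reads
# one byte from each consecutive 64-byte row. Since a row is no larger
# than a cache line on a typical 64-byte-line CPU, every line of the
# block is touched whichever entry is selected, which is what makes
# the gather suitable for secret indices.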
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
.align	6
ecp_nistz256_to_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,.LRR		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.LRR		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
.align	4
ecp_nistz256_from_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	mov	$bi,#1			// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.Lone		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	mov	$t0,$a0
	mov	$t1,$a1
	mov	$t2,$a2
	mov	$t3,$a3

	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
	# A reduction iteration is normally performed by accumulating
	# the result of multiplying the modulus by the "magic" digit
	# [and omitting the least significant word, which is guaranteed
	# to be 0], but thanks to the special form of the modulus and
	# the "magic" digit being equal to the least significant word,
	# it can be performed with additions and subtractions alone.
	# Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	#          * abcdefgh
	#          + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#          + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	#          - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or, marking redundant operations:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	#          + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	#          - 0000abcd.efgh0000.--------.--------.--------

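	# To spell the identity out: with the "magic" digit d = acc[0]
	# and p = 2^256 - 2^224 + 2^192 + 2^96 - 1, adding d*p clears
	# the bottom limb (d plus d*0xff..ff is 0 mod 2^64), and what
	# remains is exactly
	#
	#	d*(p+1) = d*2^96 + d*0xffffffff00000001*2^192
	#
	# Below, (t0,t1) = (d<<32,d>>32) is the d<<96 term straddling
	# limbs 1-2, while (t2,t3) = (d-(d<<32), d-(d>>32)-borrow),
	# produced by the subs/sbc pair, is the 128-bit product
	# d*0xffffffff00000001 added into limbs 3-4.
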
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below marks carrying into the high part of
	//  the multiplication result, which can't overflow, because it
	//  can never be all ones.
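	//
	//  (In formula form: for a = a3.a2.a1.a0 the code evaluates
	//   a^2 as the sum of a[i]*a[i] terms at weight 2^(128*i) plus
	//   twice the six cross products a[i]*a[j], i<j, at weight
	//   2^(64*(i+j)): cross products first, doubled in one carry
	//   chain, then the four squares added on top.)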

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

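// Halving mod P: an odd value is first made even by adding the (odd)
// modulus, with the carry parked in a spare register to serve as bit
// 256; a plain 257-bit right shift then yields a/2 mod P. In plain
// arithmetic: a/2 = a even ? a>>1 : (a+P)>>1.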
.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

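# The sequence of calls below follows the standard Jacobian doubling
# formulas that ecp_nistz256.c itself uses:
#
#	S  = 4*X*Y^2
#	M  = 3*(X + Z^2)*(X - Z^2)
#	X' = M^2 - 2*S
#	Y' = M*(S - X') - 8*Y^4
#	Z' = 2*Y*Z
#
# The "forward load" comments mark input loads hoisted into the
# preceding call purely to hide load-to-use latency.
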
$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	mov	$ap_real,$ap
	ldr	$poly1,.Lpoly+8
	mov	$t0,$acc0
	ldr	$poly3,.Lpoly+24
	mov	$t1,$acc1
	ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$M]
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#80
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

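# The call sequence below tracks the textbook Jacobian addition that
# ecp_nistz256.c implements:
#
#	U1 = X1*Z2^2,	S1 = Y1*Z2^3
#	U2 = X2*Z1^2,	S2 = Y2*Z1^3
#	H  = U2 - U1,	R  = S2 - S1
#	X3 = R^2 - H^3 - 2*U1*H^2
#	Y3 = R*(U1*H^2 - X3) - S1*H^3
#	Z3 = Z1*Z2*H
#
# These formulas break down when H = R = 0 (doubling) or when either
# input is at infinity, hence the is_equal()/infinity branches below.
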
$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty
	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[$ap_real]
	ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp,$acc0,$acc2

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2
	tst	$acc0,$acc0
	b.ne	.Ladd_proceed		// is_equal(U1,U2)?

	tst	$in1infty,$in2infty
	b.eq	.Ladd_proceed		// (in1infty || in2infty)?

	tst	$temp,$temp
	b.eq	.Ladd_double		// is_equal(S1,S2)?

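	// Reaching this point means H == 0, neither input is at
	// infinity and S1 != S2, i.e. in1 == -in2. The sum is the
	// point at infinity, encoded here as all-zero coordinates.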
	eor	$a0,$a0,$a0
	eor	$a1,$a1,$a1
	stp	$a0,$a1,[$rp_real]
	stp	$a0,$a1,[$rp_real,#16]
	stp	$a0,$a1,[$rp_real,#32]
	stp	$a0,$a1,[$rp_real,#48]
	stp	$a0,$a1,[$rp_real,#64]
	stp	$a0,$a1,[$rp_real,#80]
	b	.Ladd_done

.align	4
.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	add	sp,sp,#32*(12-4)	// difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// !in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// !in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// !in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// !in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

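# The affine variant relies on in2 having an implicit Z2 == 1 (in
# Montgomery form): U1 = in1_x and S1 = in1_y come for free, Z2sqr
# drops out entirely, and res_z = H*in1_z. In the conditional-move
# code at the bottom, in2's absent Z coordinate is supplied from
# .Lone_mont; the biased "adr bp_real,.Lone_mont-64" lines the
# constant up with the loop's #i+32 addressing.
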
$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// !in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// !in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// !in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// !in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	#          * abcdefgh
	#          + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#          - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	#          + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
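	#
	# (Here .LordK = 0xccd1c8aaee00bc4f is the customary Montgomery
	# factor -1/ord mod 2^64, so that t4 = acc0*ordk makes
	# acc + t4*ord divisible by 2^64. Only the two upper limbs of
	# the group order have the special form exploited by the
	# shift-and-subtract trick pictured above; the two lower limbs
	# are multiplied out for real.)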
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                int rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	// |  |  |  |  |  |a1*a0|  |
	// |  |  |  |  |a2*a0|  |  |
	// |  |a3*a2|a3*a0|  |  |  |
	// |  |  |  |a2*a1|  |  |  |
	// |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	// "can't overflow" below marks carrying into the high part of
	// the multiplication result, which can't overflow, because it
	// can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
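# A note on the four routines below: the table access pattern is laid
# out to be independent of the (potentially secret) index. The scatter
# routines transpose each point across 64-byte rows, so a gather always
# reads the same stride-64 sequence of rows whichever entry it
# extracts, and the gathers map index 0 to the all-zero point at
# infinity with csetm/csel/and masking rather than a branch.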
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//						     int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT;	# enforce flush