#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.

# Command line: the first argument is the assembler flavour; the output
# file is the first subsequent argument that looks like "name.ext".
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Directory this script was invoked from, including the trailing path
# separator, or "" when $0 has no directory component.  Testing the match
# result avoids picking up a stale $1 when the pattern does not match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# The translator path and the output file name are quoted so that paths
# containing spaces survive the shell, and a failure to start the pipe is
# reported instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

{
# Symbolic names for the general-purpose registers used by the generated
# code.  x18 is deliberately left out of the list (presumably because it
# is the platform-reserved register -- confirm against AAPCS64).
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table is looked for in the current directory first and then one
# level above the directory of this script ($dir was computed above).
my $table_fh;
open($table_fh,"<","ecp_nistz256_table.c")		or
open($table_fh,"<","${dir}../ecp_nistz256_table.c")	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Every TOBN(hi,lo) pair contributes two words to @arr, low word first.
while (my $row = <$table_fh>) {
	while ($row =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
		push @arr,hex($2),hex($1);
	}
}
close $table_fh;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
#
# Each of the 37 passes consumes 64*16 words from @arr (64 entries of 16
# words) and emits 64 .byte rows: row $i holds byte $i of every one of
# the 64 entries.  "$i/4" is an integer division because "use integer"
# is in effect in this scope.
for(1..37) {
	my @tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		my @line;
		for($j=0;$j<64;$j++) {
			# word $i/4 of entry $j, byte $i%4 within that word
			push @line,($tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
# Constants followed by the exported field-arithmetic entry points and
# the internal __ecp_nistz256_* helpers.  The exported wrappers load their
# operands and words 1 and 3 of .Lpoly into $poly1/$poly3, then call the
# matching helper; every helper stores its 4-word result through $rp.
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
.align	6
ecp_nistz256_to_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,.LRR		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.LRR		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
.align	4
ecp_nistz256_from_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	mov	$bi,#1			// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.Lone		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	mov	$t0,$a0
	mov	$t1,$a1
	mov	$t2,$a2
	mov	$t3,$a3

	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//				        const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
# One pass per remaining word b[1..3]: reduce the running sum by one word
# and accumulate a[0-3]*b[$i].
for($i=1;$i<4;$i++) {
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------

$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below mark carrying into high part of
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	mov	$ap_real,$ap
	ldr	$poly1,.Lpoly+8
	mov	$t0,$acc0
	ldr	$poly3,.Lpoly+24
	mov	$t1,$acc1
	ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$M]
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty
	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[$ap_real]
	ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)

	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
	orr	$acc0,$acc0,$temp1
	orr	$acc0,$acc0,$temp2
	orr	$acc0,$acc0,$temp0
	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#32*(12-4)	// difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
# Select and store the first two 32-byte coordinates (offsets 0 and 32);
# each pass also pre-loads "res" and "in2" for the next coordinate.
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
# $i is 64 here (left over from the loop above), so this handles the
# third coordinate without issuing any further forward loads.
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
# Register homes that must survive the bl calls below: x21..x26 are
# spilled by the prologue, so the three argument pointers and the two
# "not at infinity" masks are parked there.  $temp (x26) is reserved by
# this list but is not referenced by the code generated for this routine.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

# Outline of the generated routine:
#  - frame: 80 bytes of saved registers plus 32*10 bytes of temporaries;
#  - in1infty becomes all-ones unless every limb of in1_z is zero, and
#    in2infty becomes all-ones unless every limb of in2_x and in2_y is
#    zero, i.e. both registers hold the complement of "is infinity";
#  - the p256_* call named in each trailing comment is what the
#    preceding register set-up feeds to the __ecp_nistz256_* helper.
$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
# Branch-free output selection, 32 bytes (one coordinate) per step:
#   out = in2infty mask set ? (in1infty mask set ? res : in2) : in1
# Each step also preloads res and in2 for the following coordinate.
# The comments used to read "~$in1intfy"/"~$in2intfy"; those misspelt,
# never-assigned variables interpolated to nothing, so the text is now
# spelled out literally to match the csetm comments above.
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
# Emitted once, after the first step has issued its in2 preload: from
# here on the "in2" preload of the second step (offset 64) reads
# .Lone_mont, standing in for the Z coordinate an affine in2 lacks.
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
# Third coordinate ($i is 64 once the loop exits); nothing left to preload.
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~in2infty, remember?
// second select of the last coordinate: a zero in2infty mask keeps in1
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
########################################################################
# Montgomery multiplication and squaring modulo the value stored at .Lord.
# NOTE(review): .Lord is defined outside this part of the file; it is
# presumably the P-256 group order in words 0..3 with the Montgomery
# reduction constant in word 4 - confirm at its definition.
# The always-true if() only opens a scope for the register aliases below.
if (1) {
# Modulus limbs 0 and 1 reuse the registers otherwise named $poly1/$poly3.
my ($ord0,$ord1) = ($poly1,$poly3);
# Modulus limbs 2 and 3, the word loaded from .Lord+32, and the per-round
# reduction factor live in x21..x24 (saved and restored by both routines).
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
# Shadows the outer $acc7 (an alias of $bp): ecp_nistz256_ord_sqr_mont
# keeps its repeat counter in $bp, so the eighth accumulator takes $bi.
my $acc7 = $bi;

# First row of the product, a[0..3]*b[0], plus the first reduction factor.
$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
	#
	# In the generated code the lsl/lsr/subs/sbcs group is the "-"
	# row and the mul/umulh by $ord0/$ord1 supply the two low limbs
	# of the "+" row; the two high limbs are plain additions of $t4.
	# "subs xzr,acc0,#1" leaves C set exactly when acc0 is non-zero.
	# NOTE(review): that appears to stand in for the carry out of
	# acc0 + lo($ord0*$t4), a sum that is never formed - confirm.
	# The multiplies by the next b[i] are interleaved with the
	# reduction, and the factor for the next round is produced by
	# the "mul $t4,acc0,$ordk" near the end.
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
# Fourth reduction, the final conditional subtraction of the modulus and
# the epilogue.  Only x29 is reloaded on exit: ecp_nistz256_ord_mul_mont
# makes no call, so x30 still holds the return address.  The same
# here-doc carries on with the head of ecp_nistz256_ord_sqr_mont.
# "\$accx" in the diagram is escaped so the text reaches the output
# instead of interpolating a variable that does not exist.
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                int rep);
// The squaring is repeated rep times; each result is the next input.
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is \$accx, i.e. follow \$accx
	//
	//  "can't overflow" below mark carrying into high part of
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.
// Off-diagonal products a[i]*a[j], i>j, come first; they are doubled below.
	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
# Four reduction rounds over the low half (acc0..acc3) of the eight-limb
# square.  Every round but the last also emits the multiply that yields
# the next round's factor, written to the register currently named $t3
# because $t4 is still in use; the name swap at the bottom of the loop
# turns it into $t4 for the following round.  The swap runs four times,
# so $t3/$t4 name their original registers again after the loop.
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
# Factor for the next round; not needed after the final round.
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
# Fold in the upper half (acc4..acc7), subtract the modulus once if that
# does not borrow, and feed the result back into a0..a3 so the loop can
# square it again while the counter in $bp is non-zero.
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
# Closes the if (1) scope and the register-naming scope opened near the
# top of the file.
}	}

########################################################################
# scatter-gather subroutines
{
# Arguments by position (x0..x2).  $mask names x3, which the gather
# routines below address literally as x3.
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	stp	x29,x30,[sp,#-16]!
// Each coordinate is four 64-bit limbs; their eight 32-bit halves are
// stored to eight rows 64 bytes apart (low halves in rows 0-3, high
// halves in rows 4-7). The column is 4*index with every store biased
// by -4, so index 1 selects byte column 0.
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	str	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	stp	x29,x30,[sp,#-16]!
// x3 = (index != 0) ? -1 : 0, and adding it turns a non-zero index
// into index-1. No instruction below alters the flags of that cmp, so
// every csel still tests it: index 0 produces an all-zero point.
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	stp	x29,x30,[sp,#-16]!
// The index is used unbiased as the byte column. Eight rounds each take
// 8 input bytes and store every byte to its own row, rows being 64
// bytes apart. The closing b.ne tests the flags of the subs at the top
// of the loop; nothing in between alters them.
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//						     int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	stp	x29,x30,[sp,#-16]!
// x3 = all-ones unless the index is 0, and a non-zero index is reduced
// by one before it is used as the byte column. Each of the eight output
// words is assembled from 8 bytes read 64 bytes apart and then ANDed
// with x3, so index 0 yields all-zero output.
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

# Emit the accumulated assembly line by line.  Any `...` span in a line
# is replaced by the result of eval-ing its contents as Perl.  STDOUT was
# aliased near the top of the file to a pipe into arm-xlate.pl, which is
# why the close below is checked.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush