#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;

# Usage: <this script> <flavour> <output>
# Both arguments are passed through verbatim to arm-xlate.pl, which
# receives the generated assembly on its standard input.
$flavour=shift;
$output=shift;

# Locate arm-xlate.pl either next to this script or in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed below is piped through the translator.  Fail early
# if the pipe cannot be set up instead of silently printing into a dead
# handle.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register map for the scalar code.  The four arguments live in x0-x3;
# poly1305_emit reuses the 2nd and 3rd argument registers as mac/nonce.
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

# Context layout as used by the code below (byte offsets from ctx):
#    0..23  hash value: three 64-bit limbs (base 2^64), or five 32-bit
#           limbs (base 2^26) once the NEON path has been taken
#   24      is_base2_26 flag (64-bit)
#   32..47  clamped key r0,r1
#   48..    table of key powers written by poly1305_splat: nine rows
#           (r0,r1,s1,r2,s2,r3,s3,r4,s4) of four 32-bit lanes each

$code.=<<___;
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

// poly1305_init: x0 = context, x1 = key (may be NULL), x2 = address
// receiving two function pointers (blocks, emit).
// Returns 0 in x0 when the key pointer is NULL, 1 otherwise.
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	$t1,.LOPENSSL_armcap_P
#else
	ldr	$t1,.LOPENSSL_armcap_P
#endif
	adr	$t0,.LOPENSSL_armcap_P

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
	ldr	w17,[$t0,$t1]
#ifdef	__ARMEB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	stp	$r0,$r1,[$ctx,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	$d0,poly1305_blocks
	adr	$r0,poly1305_blocks_neon
	adr	$d1,poly1305_emit
	adr	$r1,poly1305_emit_neon

	csel	$d0,$d0,$r0,eq		// scalar versions if NEON bit is clear
	csel	$d1,$d1,$r1,eq

#ifdef	__ILP32__
	stp	w12,w13,[$len]
#else
	stp	$d0,$d1,[$len]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

// poly1305_blocks: x0 = context, x1 = input, x2 = length in bytes
// (rounded down to a multiple of 16), x3 = pad bit added on top of
// every block.  Hash is kept in base 2^64.
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$r0,$r1,[$ctx,#32]	// load key value
	ldr	$h2,[$ctx,#16]
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__ARMEB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	str	$h2,[$ctx,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

// poly1305_emit: x0 = context, x1 = output mac (16 bytes),
// x2 = nonce (16 bytes).  Expects the hash in base 2^64.
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldr	$h2,[$ctx,#16]
	ldp	$t0,$t1,[$nonce]	// load nonce

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__ARMEB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__ARMEB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___

# Register map for the NEON code.  The arrangement suffixes chosen here
# are the "natural" ones; the post-processing loop at the bottom of the
# file rewrites them per instruction where a different form is required.
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;		# borrow

$code.=<<___;
// poly1305_mult: multiply the base 2^64 hash (h0,h1,h2) by the key
// (r0,r1, with s1 = r1 + (r1 >> 2)) and partially reduce.  Result is
// left in h0,h1,h2; t0,t1,d0,d1,d2 are clobbered.
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

// poly1305_splat: convert h0,h1,h2 to five 26-bit limbs and store them,
// together with the limbs 1..4 multiplied by 5, as one 32-bit lane in
// each of the nine 16-byte table rows addressed by x0.
// Clobbers x12-x16.
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

// poly1305_blocks_neon: same arguments as poly1305_blocks.  Inputs
// shorter than 128 bytes are handed to poly1305_blocks as long as the
// hash is still in base 2^64; otherwise the hash is kept in base 2^26
// and four blocks are processed per loop iteration.
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.hs	.Lblocks_neon
	cbz	$is_base2_26,poly1305_blocks

.Lblocks_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	$len,$len,#-16
	b.eq	.Lno_data_neon

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	// odd number of blocks: process one block with scalar code
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

	and	$t0,$d2,#-4		// ... so reduce
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$h0,$t0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

#ifdef	__ARMEB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult
	ldr	x30,[sp,#8]		// reload link register after the call

	cbz	$padbit,.Lstore_base2_64_neon

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cbnz	$len,.Leven_neon

	stp	w10,w11,[$ctx]		// store hash value base 2^26
	stp	w12,w13,[$ctx,#8]
	str	w14,[$ctx,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	$h0,$h1,[$ctx]		// store hash value base 2^64
	stp	$h2,xzr,[$ctx,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

	////////////////////////////////// initialize r^n table
	// The table pointer starts at lane 3 of the first row and is moved
	// down by one lane per power, so every row ends up holding
	// r^4:r^3:r^2:r^1 in lanes 0..3.
	mov	$h0,$r0			// r^1
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	$h1,$r1
	mov	$h2,xzr
	add	$ctx,$ctx,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	$ctx,$ctx,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	mov	x4,#1
	str	x4,[$ctx,#-24]		// set is_base2_26
	sub	$ctx,$ctx,#48		// restore original $ctx
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

.Ldo_neon:
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[$in2],#48

	lsl	$padbit,$padbit,#24
	add	x15,$ctx,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN23_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN23_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	$IN23_2,x8
	fmov	$IN23_3,x10
	fmov	$IN23_4,x12

	ldp	x8,x12,[$inp],#16	// inp[0:1]
	ldp	x9,x13,[$inp],#48

	ld1	{$R0,$R1,$S1,$R2},[x15],#64
	ld1	{$S2,$R3,$S3,$R4},[x15],#64
	ld1	{$S4},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN01_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN01_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	$MASK.2d,#-1
	fmov	$IN01_2,x8
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12
	ushr	$MASK.2d,$MASK.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
	//
	// Scalar loads and base 2^26 conversion of the next four blocks
	// are interleaved with the vector multiplications below.

	subs	$len,$len,#64
	umull	$ACC4,$IN23_0,${R4}[2]
	csel	$in2,$zeros,$in2,lo
	umull	$ACC3,$IN23_0,${R3}[2]
	umull	$ACC2,$IN23_0,${R2}[2]
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	umull	$ACC1,$IN23_0,${R1}[2]
	ldp	x9,x13,[$in2],#48
	umull	$ACC0,$IN23_0,${R0}[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	$ACC4,$IN23_1,${R3}[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC3,$IN23_1,${R2}[2]
	and	x5,x9,#0x03ffffff
	umlal	$ACC2,$IN23_1,${R1}[2]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN23_1,${R0}[2]
	ubfx	x7,x9,#26,#26
	umlal	$ACC0,$IN23_1,${S4}[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	$ACC4,$IN23_2,${R2}[2]
	extr	x8,x12,x8,#52
	umlal	$ACC3,$IN23_2,${R1}[2]
	extr	x9,x13,x9,#52
	umlal	$ACC2,$IN23_2,${R0}[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC1,$IN23_2,${S4}[2]
	fmov	$IN23_0,x4
	umlal	$ACC0,$IN23_2,${S3}[2]
	and	x8,x8,#0x03ffffff

	umlal	$ACC4,$IN23_3,${R1}[2]
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN23_3,${R0}[2]
	ubfx	x10,x12,#14,#26
	umlal	$ACC2,$IN23_3,${S4}[2]
	ubfx	x11,x13,#14,#26
	umlal	$ACC1,$IN23_3,${S3}[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC0,$IN23_3,${S2}[2]
	fmov	$IN23_1,x6

	add	$IN01_2,$IN01_2,$H2
	add	x12,$padbit,x12,lsr#40
	umlal	$ACC4,$IN23_4,${R0}[2]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC3,$IN23_4,${S4}[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC2,$IN23_4,${S3}[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN23_4,${S2}[2]
	fmov	$IN23_2,x8
	umlal	$ACC0,$IN23_4,${S1}[2]
	fmov	$IN23_3,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	$IN01_0,$IN01_0,$H0
	fmov	$IN23_4,x12
	umlal	$ACC3,$IN01_2,${R1}[0]
	ldp	x8,x12,[$inp],#16	// inp[0:1]
	umlal	$ACC0,$IN01_2,${S3}[0]
	ldp	x9,x13,[$inp],#48
	umlal	$ACC4,$IN01_2,${R2}[0]
	umlal	$ACC1,$IN01_2,${S4}[0]
	umlal	$ACC2,$IN01_2,${R0}[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}[0]
	umlal	$ACC4,$IN01_0,${R4}[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC2,$IN01_0,${R2}[0]
	and	x5,x9,#0x03ffffff
	umlal	$ACC0,$IN01_0,${R0}[0]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN01_0,${R1}[0]
	ubfx	x7,x9,#26,#26

	add	$IN01_3,$IN01_3,$H3
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	$ACC3,$IN01_1,${R2}[0]
	extr	x8,x12,x8,#52
	umlal	$ACC4,$IN01_1,${R3}[0]
	extr	x9,x13,x9,#52
	umlal	$ACC0,$IN01_1,${S4}[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC2,$IN01_1,${R1}[0]
	fmov	$IN01_0,x4
	umlal	$ACC1,$IN01_1,${R0}[0]
	and	x8,x8,#0x03ffffff

	add	$IN01_4,$IN01_4,$H4
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN01_3,${R0}[0]
	ubfx	x10,x12,#14,#26
	umlal	$ACC0,$IN01_3,${S2}[0]
	ubfx	x11,x13,#14,#26
	umlal	$ACC4,$IN01_3,${R1}[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC1,$IN01_3,${S3}[0]
	fmov	$IN01_1,x6
	umlal	$ACC2,$IN01_3,${S4}[0]
	add	x12,$padbit,x12,lsr#40

	umlal	$ACC3,$IN01_4,${S4}[0]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC0,$IN01_4,${S1}[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC4,$IN01_4,${R0}[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN01_4,${S2}[0]
	fmov	$IN01_2,x8
	umlal	$ACC2,$IN01_4,${S3}[0]
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	$T0.2d,$ACC3,#26
	xtn	$H3,$ACC3
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	xtn	$H4,$ACC4
	ushr	$T1.2d,$ACC1,#26
	xtn	$H1,$ACC1
	bic	$H4,#0xfc,lsl#24
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	shrn	$T1.2s,$ACC2,#26
	xtn	$H2,$ACC2
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	bic	$H1,#0xfc,lsl#24
	add	$H3,$H3,$T1.2s		// h2 -> h3
	bic	$H2,#0xfc,lsl#24

	shrn	$T0.2s,$ACC0,#26
	xtn	$H0,$ACC0
	ushr	$T1.2s,$H3,#26
	bic	$H3,#0xfc,lsl#24
	bic	$H0,#0xfc,lsl#24
	add	$H1,$H1,$T0.2s		// h0 -> h1
	add	$H4,$H4,$T1.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	$IN23_2,${IN23_2}[0]
	add	$IN01_2,$IN01_2,$H2

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	$len,$len,#32
	b.ne	.Long_tail

	dup	$IN23_2,${IN01_2}[0]
	add	$IN23_0,$IN01_0,$H0
	add	$IN23_3,$IN01_3,$H3
	add	$IN23_1,$IN01_1,$H1
	add	$IN23_4,$IN01_4,$H4

.Long_tail:
	dup	$IN23_0,${IN23_0}[0]
	umull2	$ACC0,$IN23_2,${S3}
	umull2	$ACC3,$IN23_2,${R1}
	umull2	$ACC4,$IN23_2,${R2}
	umull2	$ACC2,$IN23_2,${R0}
	umull2	$ACC1,$IN23_2,${S4}

	dup	$IN23_1,${IN23_1}[0]
	umlal2	$ACC0,$IN23_0,${R0}
	umlal2	$ACC2,$IN23_0,${R2}
	umlal2	$ACC3,$IN23_0,${R3}
	umlal2	$ACC4,$IN23_0,${R4}
	umlal2	$ACC1,$IN23_0,${R1}

	dup	$IN23_3,${IN23_3}[0]
	umlal2	$ACC0,$IN23_1,${S4}
	umlal2	$ACC3,$IN23_1,${R2}
	umlal2	$ACC2,$IN23_1,${R1}
	umlal2	$ACC4,$IN23_1,${R3}
	umlal2	$ACC1,$IN23_1,${R0}

	dup	$IN23_4,${IN23_4}[0]
	umlal2	$ACC3,$IN23_3,${R0}
	umlal2	$ACC4,$IN23_3,${R1}
	umlal2	$ACC0,$IN23_3,${S2}
	umlal2	$ACC1,$IN23_3,${S3}
	umlal2	$ACC2,$IN23_3,${S4}

	umlal2	$ACC3,$IN23_4,${S4}
	umlal2	$ACC0,$IN23_4,${S1}
	umlal2	$ACC4,$IN23_4,${R0}
	umlal2	$ACC1,$IN23_4,${S2}
	umlal2	$ACC2,$IN23_4,${S3}

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	$IN01_0,$IN01_0,$H0
	umlal	$ACC3,$IN01_2,${R1}
	umlal	$ACC0,$IN01_2,${S3}
	umlal	$ACC4,$IN01_2,${R2}
	umlal	$ACC1,$IN01_2,${S4}
	umlal	$ACC2,$IN01_2,${R0}

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}
	umlal	$ACC0,$IN01_0,${R0}
	umlal	$ACC4,$IN01_0,${R4}
	umlal	$ACC1,$IN01_0,${R1}
	umlal	$ACC2,$IN01_0,${R2}

	add	$IN01_3,$IN01_3,$H3
	umlal	$ACC3,$IN01_1,${R2}
	umlal	$ACC0,$IN01_1,${S4}
	umlal	$ACC4,$IN01_1,${R3}
	umlal	$ACC1,$IN01_1,${R0}
	umlal	$ACC2,$IN01_1,${R1}

	add	$IN01_4,$IN01_4,$H4
	umlal	$ACC3,$IN01_3,${R0}
	umlal	$ACC0,$IN01_3,${S2}
	umlal	$ACC4,$IN01_3,${R1}
	umlal	$ACC1,$IN01_3,${S3}
	umlal	$ACC2,$IN01_3,${S4}

	umlal	$ACC3,$IN01_4,${S4}
	umlal	$ACC0,$IN01_4,${S1}
	umlal	$ACC4,$IN01_4,${R0}
	umlal	$ACC1,$IN01_4,${S2}
	umlal	$ACC2,$IN01_4,${S3}

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	$T0.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d

	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	and	$ACC4,$ACC4,$MASK.2d
	ushr	$T1.2d,$ACC1,#26
	and	$ACC1,$ACC1,$MASK.2d
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	ushr	$T1.2d,$ACC2,#26
	and	$ACC2,$ACC2,$MASK.2d
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	add	$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	st1	{$ACC4}[0],[$ctx]

.Lno_data_neon:
	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

// poly1305_emit_neon: same arguments as poly1305_emit.  Falls through
// to poly1305_emit when the hash is still in base 2^64, otherwise
// converts it from base 2^26 first.
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cbz	$is_base2_26,poly1305_emit

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$h2,$h2,xzr		// can be partially reduced...

	ldp	$t0,$t1,[$nonce]	// load nonce

	and	$d0,$h2,#-4		// ... so reduce
	add	$d0,$d0,$h2,lsr#2
	and	$h2,$h2,#3
	adds	$h0,$h0,$d0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__ARMEB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__ARMEB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Post-process the generated text line by line.  The register variables
# above carry one fixed arrangement suffix each, so the suffix is fixed
# up here per instruction.  The alternatives are chained with "or": the
# first rule that applies to a line wins.
foreach (split("\n",$code)) {
	# shrn destination written as .2d/.4d becomes .2s
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	# fmov vN.<arr>,xM becomes fmov dN,xM
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	# dup operates on .2d instead of .2s/.4s
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	# eor/and use the byte arrangement
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	# umull/umlal take .2s sources ...
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	# ... while umull2/umlal2 take .4s sources
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	# single-lane st1..st4 stores address 32-bit elements
	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	# drop the lane count in front of an element index: .4s[2] -> .s[2]
	s/\.[124]([sd])\[/.$1\[/;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";