#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;

# Command line: <flavour> <output>.  Both arguments are forwarded
# verbatim to arm-xlate.pl, which receives everything this script
# prints on STDOUT.
$flavour=shift;
$output=shift;

# Look for arm-xlate.pl next to this script first, then in the shared
# perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Fail loudly if the translator pipe cannot be started; otherwise all
# generated code would be silently discarded.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Argument registers x0..x3.  poly1305_emit reuses the second and
# third argument registers under the names $mac and $nonce.
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

# Scalar working registers x4..x14.
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

$code.=<<___;
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple
.extern	OPENSSL_armcap_P
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	$t1,.LOPENSSL_armcap_P
#else
	ldr	$t1,.LOPENSSL_armcap_P
#endif
	adr	$t0,.LOPENSSL_armcap_P

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
	ldr	w17,[$t0,$t1]
#ifdef	__ARMEB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	stp	$r0,$r1,[$ctx,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	$d0,poly1305_blocks
	adr	$r0,poly1305_blocks_neon
	adr	$d1,poly1305_emit
	adr	$r1,poly1305_emit_neon

	csel	$d0,$d0,$r0,eq
	csel	$d1,$d1,$r1,eq

#ifdef	__ILP32__
	stp	w12,w13,[$len]
#else
	stp	$d0,$d1,[$len]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$r0,$r1,[$ctx,#32]	// load key value
	ldr	$h2,[$ctx,#16]
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__ARMEB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	str	$h2,[$ctx,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldr	$h2,[$ctx,#16]
	ldp	$t0,$t1,[$nonce]	// load nonce

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__ARMEB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__ARMEB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
# Vector register map for the NEON code path (v0..v31).  The
# arrangement suffixes given here are adjusted per instruction by the
# post-processing loop at the end of this script.
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;		# borrow

$code.=<<___;
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.hs	.Lblocks_neon
	cbz	$is_base2_26,poly1305_blocks

.Lblocks_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	$len,$len,#-16
	b.eq	.Lno_data_neon

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

	and	$t0,$d2,#-4		// ... so reduce
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$h0,$t0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

#ifdef	__ARMEB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	$padbit,.Lstore_base2_64_neon

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cbnz	$len,.Leven_neon

	stp	w10,w11,[$ctx]		// store hash value base 2^26
	stp	w12,w13,[$ctx,#8]
	str	w14,[$ctx,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	$h0,$h1,[$ctx]		// store hash value base 2^64
	stp	$h2,xzr,[$ctx,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

	////////////////////////////////// initialize r^n table
	mov	$h0,$r0			// r^1
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	$h1,$r1
	mov	$h2,xzr
	add	$ctx,$ctx,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	$ctx,$ctx,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	mov	x4,#1
	str	x4,[$ctx,#-24]		// set is_base2_26
	sub	$ctx,$ctx,#48		// restore original $ctx
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

.Ldo_neon:
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[$in2],#48

	lsl	$padbit,$padbit,#24
	add	x15,$ctx,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN23_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN23_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	$IN23_2,x8
	fmov	$IN23_3,x10
	fmov	$IN23_4,x12

	ldp	x8,x12,[$inp],#16	// inp[0:1]
	ldp	x9,x13,[$inp],#48

	ld1	{$R0,$R1,$S1,$R2},[x15],#64
	ld1	{$S2,$R3,$S3,$R4},[x15],#64
	ld1	{$S4},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN01_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN01_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	$MASK.2d,#-1
	fmov	$IN01_2,x8
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12
	ushr	$MASK.2d,$MASK.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	$len,$len,#64
	umull	$ACC4,$IN23_0,${R4}[2]
	csel	$in2,$zeros,$in2,lo
	umull	$ACC3,$IN23_0,${R3}[2]
	umull	$ACC2,$IN23_0,${R2}[2]
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	umull	$ACC1,$IN23_0,${R1}[2]
	ldp	x9,x13,[$in2],#48
	umull	$ACC0,$IN23_0,${R0}[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	$ACC4,$IN23_1,${R3}[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC3,$IN23_1,${R2}[2]
	and	x5,x9,#0x03ffffff
	umlal	$ACC2,$IN23_1,${R1}[2]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN23_1,${R0}[2]
	ubfx	x7,x9,#26,#26
	umlal	$ACC0,$IN23_1,${S4}[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	$ACC4,$IN23_2,${R2}[2]
	extr	x8,x12,x8,#52
	umlal	$ACC3,$IN23_2,${R1}[2]
	extr	x9,x13,x9,#52
	umlal	$ACC2,$IN23_2,${R0}[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC1,$IN23_2,${S4}[2]
	fmov	$IN23_0,x4
	umlal	$ACC0,$IN23_2,${S3}[2]
	and	x8,x8,#0x03ffffff

	umlal	$ACC4,$IN23_3,${R1}[2]
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN23_3,${R0}[2]
	ubfx	x10,x12,#14,#26
	umlal	$ACC2,$IN23_3,${S4}[2]
	ubfx	x11,x13,#14,#26
	umlal	$ACC1,$IN23_3,${S3}[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC0,$IN23_3,${S2}[2]
	fmov	$IN23_1,x6

	add	$IN01_2,$IN01_2,$H2
	add	x12,$padbit,x12,lsr#40
	umlal	$ACC4,$IN23_4,${R0}[2]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC3,$IN23_4,${S4}[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC2,$IN23_4,${S3}[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN23_4,${S2}[2]
	fmov	$IN23_2,x8
	umlal	$ACC0,$IN23_4,${S1}[2]
	fmov	$IN23_3,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	$IN01_0,$IN01_0,$H0
	fmov	$IN23_4,x12
	umlal	$ACC3,$IN01_2,${R1}[0]
	ldp	x8,x12,[$inp],#16	// inp[0:1]
	umlal	$ACC0,$IN01_2,${S3}[0]
	ldp	x9,x13,[$inp],#48
	umlal	$ACC4,$IN01_2,${R2}[0]
	umlal	$ACC1,$IN01_2,${S4}[0]
	umlal	$ACC2,$IN01_2,${R0}[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}[0]
	umlal	$ACC4,$IN01_0,${R4}[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC2,$IN01_0,${R2}[0]
	and	x5,x9,#0x03ffffff
	umlal	$ACC0,$IN01_0,${R0}[0]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN01_0,${R1}[0]
	ubfx	x7,x9,#26,#26

	add	$IN01_3,$IN01_3,$H3
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	$ACC3,$IN01_1,${R2}[0]
	extr	x8,x12,x8,#52
	umlal	$ACC4,$IN01_1,${R3}[0]
	extr	x9,x13,x9,#52
	umlal	$ACC0,$IN01_1,${S4}[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC2,$IN01_1,${R1}[0]
	fmov	$IN01_0,x4
	umlal	$ACC1,$IN01_1,${R0}[0]
	and	x8,x8,#0x03ffffff

	add	$IN01_4,$IN01_4,$H4
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN01_3,${R0}[0]
	ubfx	x10,x12,#14,#26
	umlal	$ACC0,$IN01_3,${S2}[0]
	ubfx	x11,x13,#14,#26
	umlal	$ACC4,$IN01_3,${R1}[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC1,$IN01_3,${S3}[0]
	fmov	$IN01_1,x6
	umlal	$ACC2,$IN01_3,${S4}[0]
	add	x12,$padbit,x12,lsr#40

	umlal	$ACC3,$IN01_4,${S4}[0]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC0,$IN01_4,${S1}[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC4,$IN01_4,${R0}[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN01_4,${S2}[0]
	fmov	$IN01_2,x8
	umlal	$ACC2,$IN01_4,${S3}[0]
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	$T0.2d,$ACC3,#26
	xtn	$H3,$ACC3
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	xtn	$H4,$ACC4
	ushr	$T1.2d,$ACC1,#26
	xtn	$H1,$ACC1
	bic	$H4,#0xfc,lsl#24
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	shrn	$T1.2s,$ACC2,#26
	xtn	$H2,$ACC2
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	bic	$H1,#0xfc,lsl#24
	add	$H3,$H3,$T1.2s		// h2 -> h3
	bic	$H2,#0xfc,lsl#24

	shrn	$T0.2s,$ACC0,#26
	xtn	$H0,$ACC0
	ushr	$T1.2s,$H3,#26
	bic	$H3,#0xfc,lsl#24
	bic	$H0,#0xfc,lsl#24
	add	$H1,$H1,$T0.2s		// h0 -> h1
	add	$H4,$H4,$T1.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	$IN23_2,${IN23_2}[0]
	add	$IN01_2,$IN01_2,$H2

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	$len,$len,#32
	b.ne	.Long_tail

	dup	$IN23_2,${IN01_2}[0]
	add	$IN23_0,$IN01_0,$H0
	add	$IN23_3,$IN01_3,$H3
	add	$IN23_1,$IN01_1,$H1
	add	$IN23_4,$IN01_4,$H4

.Long_tail:
	dup	$IN23_0,${IN23_0}[0]
	umull2	$ACC0,$IN23_2,${S3}
	umull2	$ACC3,$IN23_2,${R1}
	umull2	$ACC4,$IN23_2,${R2}
	umull2	$ACC2,$IN23_2,${R0}
	umull2	$ACC1,$IN23_2,${S4}

	dup	$IN23_1,${IN23_1}[0]
	umlal2	$ACC0,$IN23_0,${R0}
	umlal2	$ACC2,$IN23_0,${R2}
	umlal2	$ACC3,$IN23_0,${R3}
	umlal2	$ACC4,$IN23_0,${R4}
	umlal2	$ACC1,$IN23_0,${R1}

	dup	$IN23_3,${IN23_3}[0]
	umlal2	$ACC0,$IN23_1,${S4}
	umlal2	$ACC3,$IN23_1,${R2}
	umlal2	$ACC2,$IN23_1,${R1}
	umlal2	$ACC4,$IN23_1,${R3}
	umlal2	$ACC1,$IN23_1,${R0}

	dup	$IN23_4,${IN23_4}[0]
	umlal2	$ACC3,$IN23_3,${R0}
	umlal2	$ACC4,$IN23_3,${R1}
	umlal2	$ACC0,$IN23_3,${S2}
	umlal2	$ACC1,$IN23_3,${S3}
	umlal2	$ACC2,$IN23_3,${S4}

	umlal2	$ACC3,$IN23_4,${S4}
	umlal2	$ACC0,$IN23_4,${S1}
	umlal2	$ACC4,$IN23_4,${R0}
	umlal2	$ACC1,$IN23_4,${S2}
	umlal2	$ACC2,$IN23_4,${S3}

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	$IN01_0,$IN01_0,$H0
	umlal	$ACC3,$IN01_2,${R1}
	umlal	$ACC0,$IN01_2,${S3}
	umlal	$ACC4,$IN01_2,${R2}
	umlal	$ACC1,$IN01_2,${S4}
	umlal	$ACC2,$IN01_2,${R0}

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}
	umlal	$ACC0,$IN01_0,${R0}
	umlal	$ACC4,$IN01_0,${R4}
	umlal	$ACC1,$IN01_0,${R1}
	umlal	$ACC2,$IN01_0,${R2}

	add	$IN01_3,$IN01_3,$H3
	umlal	$ACC3,$IN01_1,${R2}
	umlal	$ACC0,$IN01_1,${S4}
	umlal	$ACC4,$IN01_1,${R3}
	umlal	$ACC1,$IN01_1,${R0}
	umlal	$ACC2,$IN01_1,${R1}

	add	$IN01_4,$IN01_4,$H4
	umlal	$ACC3,$IN01_3,${R0}
	umlal	$ACC0,$IN01_3,${S2}
	umlal	$ACC4,$IN01_3,${R1}
	umlal	$ACC1,$IN01_3,${S3}
	umlal	$ACC2,$IN01_3,${S4}

	umlal	$ACC3,$IN01_4,${S4}
	umlal	$ACC0,$IN01_4,${S1}
	umlal	$ACC4,$IN01_4,${R0}
	umlal	$ACC1,$IN01_4,${S2}
	umlal	$ACC2,$IN01_4,${S3}

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	$T0.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d

	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	and	$ACC4,$ACC4,$MASK.2d
	ushr	$T1.2d,$ACC1,#26
	and	$ACC1,$ACC1,$MASK.2d
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	ushr	$T1.2d,$ACC2,#26
	and	$ACC2,$ACC2,$MASK.2d
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	add	$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	st1	{$ACC4}[0],[$ctx]

.Lno_data_neon:
	.inst	0xd50323bf		// autiasp
	ldr	x29,[sp],#80
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cbz	$is_base2_26,poly1305_emit

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$h2,$h2,xzr		// can be partially reduced...

	ldp	$t0,$t1,[$nonce]	// load nonce

	and	$d0,$h2,#-4		// ... so reduce
	add	$d0,$d0,$h2,lsr#2
	and	$h2,$h2,#3
	adds	$h0,$h0,$d0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__ARMEB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__ARMEB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Post-process the generated text line by line before handing it to
# arm-xlate.pl.  The vector register variables above carry a fixed
# arrangement suffix, so the suffix is rewritten here to suit each
# instruction.  The rules are chained with "or": only the first rule
# that applies to a line is used.
#   shrn         a .2d/.4d suffix directly after the destination -> .2s
#   fmov vN..,xM the vector destination is rewritten as dN
#   dup          .2s/.4s -> .2d
#   eor/and      any .<n>[sdh] suffix -> .16b
#   umull/umlal  .4s -> .2s
#   umull2/umlal2 .2s -> .4s
#   st1..st4 with a lane index: .2d/.4d -> .s
# Afterwards a lane count directly in front of an element index is
# dropped (".4s[" -> ".s[", ".2d[" -> ".d[").
foreach (split("\n",$code)) {
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	s/\.[124]([sd])\[/.$1\[/;

	print $_,"\n";
}
# Closing the pipe is where a failure of the translator process
# becomes visible, so the result is checked.
close STDOUT or die "error closing STDOUT: $!";