#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*)	estimate based on resource availability is less than 1.0,
#	i.e. the measured result is worse than expected, presumably the
#	binary translator is not almighty;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

$code.=<<___;
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__ARMEB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	stp	$r0,$r1,[$ctx,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit
	adr	$r1,.Lpoly1305_emit_neon

	csel	$d0,$d0,$r0,eq
	csel	$d1,$d1,$r1,eq

#ifdef	__ILP32__
	stp	w12,w13,[$len]
#else
	stp	$d0,$d1,[$len]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$r0,$r1,[$ctx,#32]	// load key value
	ldr	$h2,[$ctx,#16]
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

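	// A note on the scalar loop below: it computes h = (h + inp)*r
	// mod 2^130-5 one 16-byte block at a time, with the hash in three
	// 64-bit limbs h0:h1:h2 and the clamped key in r0:r1; the padding
	// bit enters at 2^128 via h2. s1 = r1 + (r1>>2) folds the modular
	// reduction into the multiplication: 2^128 is congruent to 5/4
	// mod 2^130-5 and r1 is a multiple of 4 thanks to the clamping
	// above, so the h1*r1 term at weight 2^128 contributes exactly
	// h1*s1 to the low limb. The "final reduction" likewise keeps the
	// two low bits of d2 and folds the rest back in as 5*(d2>>2).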
.align	5
.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__ARMEB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	str	$h2,[$ctx,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldr	$h2,[$ctx,#16]
	ldp	$t0,$t1,[$nonce]	// load nonce

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__ARMEB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__ARMEB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;		# borrow

$code.=<<___;
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

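// poly1305_splat converts the value in h0:h1:h2 from base 2^64 to five
// 26-bit limbs and stores them, together with the premultiplied 5*r[i]
// companions, into one 32-bit lane of the NEON key table. Base 2^26 lets
// the NEON code below use 32x32->64-bit umull/umlal for the limb
// products, with the 5*r[i] values playing the same role as s1 in the
// scalar code.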
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.hs	.Lblocks_neon
	cbz	$is_base2_26,.Lpoly1305_blocks

.Lblocks_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	$len,$len,#-16
	b.eq	.Lno_data_neon

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

	and	$t0,$d2,#-4		// ... so reduce
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$h0,$t0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

#ifdef	__ARMEB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	$padbit,.Lstore_base2_64_neon

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cbnz	$len,.Leven_neon

	stp	w10,w11,[$ctx]		// store hash value base 2^26
	stp	w12,w13,[$ctx,#8]
	str	w14,[$ctx,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	$h0,$h1,[$ctx]		// store hash value base 2^64
	stp	$h2,xzr,[$ctx,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

	////////////////////////////////// initialize r^n table
	mov	$h0,$r0			// r^1
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	$h1,$r1
	mov	$h2,xzr
	add	$ctx,$ctx,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	$ctx,$ctx,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

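	// The table just built holds r^4,r^3,r^2,r^1 in 32-bit lanes 0..3
	// of the nine vectors r0,r1,s1,r2,s2,r3,s3,r4,s4: each call to
	// poly1305_splat fills one lane, with the pointer stepping back 4
	// bytes per power. The main loop pulls lane [2] (the r^2 power)
	// for the inp[2:3] stream and lane [0] (the r^4 power) for the
	// hash+inp[0:1] stream.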
	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	mov	x4,#1
	stur	x4,[$ctx,#-24]		// set is_base2_26
	sub	$ctx,$ctx,#48		// restore original $ctx
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

.Ldo_neon:
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[$in2],#48

	lsl	$padbit,$padbit,#24
	add	x15,$ctx,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN23_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN23_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	$IN23_2,x8
	fmov	$IN23_3,x10
	fmov	$IN23_4,x12

	ldp	x8,x12,[$inp],#16	// inp[0:1]
	ldp	x9,x13,[$inp],#48

	ld1	{$R0,$R1,$S1,$R2},[x15],#64
	ld1	{$S2,$R3,$S3,$R4},[x15],#64
	ld1	{$S4},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN01_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN01_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	$MASK.2d,#-1
	fmov	$IN01_2,x8
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12
	ushr	$MASK.2d,$MASK.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
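	//
	// The 5*r[j] terms implement the modular reduction: a partial
	// product h[i]*r[j] with i+j >= 5 has weight 2^(26*(i+j)) =
	// 2^130 * 2^(26*(i+j-5)), and 2^130 is congruent to 5 mod 2^130-5,
	// so it folds back into limb i+j-5 multiplied by 5.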

	subs	$len,$len,#64
	umull	$ACC4,$IN23_0,${R4}[2]
	csel	$in2,$zeros,$in2,lo
	umull	$ACC3,$IN23_0,${R3}[2]
	umull	$ACC2,$IN23_0,${R2}[2]
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	umull	$ACC1,$IN23_0,${R1}[2]
	ldp	x9,x13,[$in2],#48
	umull	$ACC0,$IN23_0,${R0}[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	$ACC4,$IN23_1,${R3}[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC3,$IN23_1,${R2}[2]
	and	x5,x9,#0x03ffffff
	umlal	$ACC2,$IN23_1,${R1}[2]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN23_1,${R0}[2]
	ubfx	x7,x9,#26,#26
	umlal	$ACC0,$IN23_1,${S4}[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	$ACC4,$IN23_2,${R2}[2]
	extr	x8,x12,x8,#52
	umlal	$ACC3,$IN23_2,${R1}[2]
	extr	x9,x13,x9,#52
	umlal	$ACC2,$IN23_2,${R0}[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC1,$IN23_2,${S4}[2]
	fmov	$IN23_0,x4
	umlal	$ACC0,$IN23_2,${S3}[2]
	and	x8,x8,#0x03ffffff

	umlal	$ACC4,$IN23_3,${R1}[2]
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN23_3,${R0}[2]
	ubfx	x10,x12,#14,#26
	umlal	$ACC2,$IN23_3,${S4}[2]
	ubfx	x11,x13,#14,#26
	umlal	$ACC1,$IN23_3,${S3}[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC0,$IN23_3,${S2}[2]
	fmov	$IN23_1,x6

	add	$IN01_2,$IN01_2,$H2
	add	x12,$padbit,x12,lsr#40
	umlal	$ACC4,$IN23_4,${R0}[2]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC3,$IN23_4,${S4}[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC2,$IN23_4,${S3}[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN23_4,${S2}[2]
	fmov	$IN23_2,x8
	umlal	$ACC0,$IN23_4,${S1}[2]
	fmov	$IN23_3,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	$IN01_0,$IN01_0,$H0
	fmov	$IN23_4,x12
	umlal	$ACC3,$IN01_2,${R1}[0]
	ldp	x8,x12,[$inp],#16	// inp[0:1]
	umlal	$ACC0,$IN01_2,${S3}[0]
	ldp	x9,x13,[$inp],#48
	umlal	$ACC4,$IN01_2,${R2}[0]
	umlal	$ACC1,$IN01_2,${S4}[0]
	umlal	$ACC2,$IN01_2,${R0}[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}[0]
	umlal	$ACC4,$IN01_0,${R4}[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC2,$IN01_0,${R2}[0]
	and	x5,x9,#0x03ffffff
	umlal	$ACC0,$IN01_0,${R0}[0]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN01_0,${R1}[0]
	ubfx	x7,x9,#26,#26

	add	$IN01_3,$IN01_3,$H3
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	$ACC3,$IN01_1,${R2}[0]
	extr	x8,x12,x8,#52
	umlal	$ACC4,$IN01_1,${R3}[0]
	extr	x9,x13,x9,#52
	umlal	$ACC0,$IN01_1,${S4}[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC2,$IN01_1,${R1}[0]
	fmov	$IN01_0,x4
	umlal	$ACC1,$IN01_1,${R0}[0]
	and	x8,x8,#0x03ffffff

	add	$IN01_4,$IN01_4,$H4
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN01_3,${R0}[0]
	ubfx	x10,x12,#14,#26
	umlal	$ACC0,$IN01_3,${S2}[0]
	ubfx	x11,x13,#14,#26
	umlal	$ACC4,$IN01_3,${R1}[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC1,$IN01_3,${S3}[0]
	fmov	$IN01_1,x6
	umlal	$ACC2,$IN01_3,${S4}[0]
	add	x12,$padbit,x12,lsr#40

	umlal	$ACC3,$IN01_4,${S4}[0]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC0,$IN01_4,${S1}[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC4,$IN01_4,${R0}[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN01_4,${S2}[0]
	fmov	$IN01_2,x8
	umlal	$ACC2,$IN01_4,${S3}[0]
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
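	//
	// Only single-limb carries are propagated below (h3->h4, h0->h1,
	// h4->h0 with the factor-of-5 wrap, h1->h2, h2->h3, then h0->h1
	// and h3->h4 once more), so limbs may stay slightly above 26 bits.
	// That is fine: the next iteration's multiply-accumulate chains
	// still have plenty of headroom in their 64-bit lanes.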

	ushr	$T0.2d,$ACC3,#26
	xtn	$H3,$ACC3
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	xtn	$H4,$ACC4
	ushr	$T1.2d,$ACC1,#26
	xtn	$H1,$ACC1
	bic	$H4,#0xfc,lsl#24
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	shrn	$T1.2s,$ACC2,#26
	xtn	$H2,$ACC2
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	bic	$H1,#0xfc,lsl#24
	add	$H3,$H3,$T1.2s		// h2 -> h3
	bic	$H2,#0xfc,lsl#24

	shrn	$T0.2s,$ACC0,#26
	xtn	$H0,$ACC0
	ushr	$T1.2s,$H3,#26
	bic	$H3,#0xfc,lsl#24
	bic	$H0,#0xfc,lsl#24
	add	$H1,$H1,$T0.2s		// h0 -> h1
	add	$H4,$H4,$T1.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	$IN23_2,${IN23_2}[0]
	add	$IN01_2,$IN01_2,$H2

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	$len,$len,#32
	b.ne	.Long_tail

	dup	$IN23_2,${IN01_2}[0]
	add	$IN23_0,$IN01_0,$H0
	add	$IN23_3,$IN01_3,$H3
	add	$IN23_1,$IN01_1,$H1
	add	$IN23_4,$IN01_4,$H4

.Long_tail:
	dup	$IN23_0,${IN23_0}[0]
	umull2	$ACC0,$IN23_2,${S3}
	umull2	$ACC3,$IN23_2,${R1}
	umull2	$ACC4,$IN23_2,${R2}
	umull2	$ACC2,$IN23_2,${R0}
	umull2	$ACC1,$IN23_2,${S4}

	dup	$IN23_1,${IN23_1}[0]
	umlal2	$ACC0,$IN23_0,${R0}
	umlal2	$ACC2,$IN23_0,${R2}
	umlal2	$ACC3,$IN23_0,${R3}
	umlal2	$ACC4,$IN23_0,${R4}
	umlal2	$ACC1,$IN23_0,${R1}

	dup	$IN23_3,${IN23_3}[0]
	umlal2	$ACC0,$IN23_1,${S4}
	umlal2	$ACC3,$IN23_1,${R2}
	umlal2	$ACC2,$IN23_1,${R1}
	umlal2	$ACC4,$IN23_1,${R3}
	umlal2	$ACC1,$IN23_1,${R0}

	dup	$IN23_4,${IN23_4}[0]
	umlal2	$ACC3,$IN23_3,${R0}
	umlal2	$ACC4,$IN23_3,${R1}
	umlal2	$ACC0,$IN23_3,${S2}
	umlal2	$ACC1,$IN23_3,${S3}
	umlal2	$ACC2,$IN23_3,${S4}

	umlal2	$ACC3,$IN23_4,${S4}
	umlal2	$ACC0,$IN23_4,${S1}
	umlal2	$ACC4,$IN23_4,${R0}
	umlal2	$ACC1,$IN23_4,${S2}
	umlal2	$ACC2,$IN23_4,${S3}

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	$IN01_0,$IN01_0,$H0
	umlal	$ACC3,$IN01_2,${R1}
	umlal	$ACC0,$IN01_2,${S3}
	umlal	$ACC4,$IN01_2,${R2}
	umlal	$ACC1,$IN01_2,${S4}
	umlal	$ACC2,$IN01_2,${R0}

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}
	umlal	$ACC0,$IN01_0,${R0}
	umlal	$ACC4,$IN01_0,${R4}
	umlal	$ACC1,$IN01_0,${R1}
	umlal	$ACC2,$IN01_0,${R2}

	add	$IN01_3,$IN01_3,$H3
	umlal	$ACC3,$IN01_1,${R2}
	umlal	$ACC0,$IN01_1,${S4}
	umlal	$ACC4,$IN01_1,${R3}
	umlal	$ACC1,$IN01_1,${R0}
	umlal	$ACC2,$IN01_1,${R1}

	add	$IN01_4,$IN01_4,$H4
	umlal	$ACC3,$IN01_3,${R0}
	umlal	$ACC0,$IN01_3,${S2}
	umlal	$ACC4,$IN01_3,${R1}
	umlal	$ACC1,$IN01_3,${S3}
	umlal	$ACC2,$IN01_3,${S4}

	umlal	$ACC3,$IN01_4,${S4}
	umlal	$ACC0,$IN01_4,${S1}
	umlal	$ACC4,$IN01_4,${R0}
	umlal	$ACC1,$IN01_4,${S2}
	umlal	$ACC2,$IN01_4,${S3}

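	// At this point each 64-bit lane of the accumulators holds the sum
	// for one of the two interleaved block streams; the addp
	// instructions below fold the two lanes together before the final
	// reduction and store.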
.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	$T0.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d

	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	and	$ACC4,$ACC4,$MASK.2d
	ushr	$T1.2d,$ACC1,#26
	and	$ACC1,$ACC1,$MASK.2d
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	ushr	$T1.2d,$ACC2,#26
	and	$ACC2,$ACC2,$MASK.2d
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	add	$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	st1	{$ACC4}[0],[$ctx]

.Lno_data_neon:
	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cbz	$is_base2_26,poly1305_emit

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$h2,$h2,xzr		// can be partially reduced...

	ldp	$t0,$t1,[$nonce]	// load nonce

	and	$d0,$h2,#-4		// ... so reduce
	add	$d0,$d0,$h2,lsr#2
	and	$h2,$h2,#3
	adds	$h0,$h0,$d0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

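	// Compare to the modulus by adding 5 and testing the bits at 2^130
	// and above: if any is set, h is at least 2^130-5 and the low 128
	// bits of h+5 equal h minus the modulus. Only 128 bits survive
	// either way; the nonce is then added to form the tag.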
	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__ARMEB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__ARMEB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

foreach (split("\n",$code)) {
	# Translate the generic vector syntax used above into valid AArch64
	# arrangement specifiers: fmov from a GPR takes a d-register
	# destination, umull/umlal take .2s (or .4s for the "2" forms)
	# sources, and/eor operate on .16b, and lane-indexed stores use .s.
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	(m/\bdup\b/ and	(s/\.[24]s/.2d/g or 1))			or
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	(m/\bum(ul|la)l\b/ and	(s/\.4s/.2s/g or 1))		or
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	s/\.[124]([sd])\[/.$1\[/;	# e.g. ".4s[2]" -> ".s[2]"

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";