#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. Likewise, it supports both 32- and 64-bit
# modes of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on mighty Apple A7, where
# results are literally equal to the theoretical estimates based on
# AES instruction latencies and issue rates. On Cortex-A53, an
# in-order execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, parallelizable-mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc	CBC dec	CTR
# Apple A7	2.39	1.20	1.20
# Cortex-A53	1.32	1.29	1.46
# Cortex-A57(*)	1.95	0.85	0.93
# Denver	1.96	0.86	0.80
# Mongoose	1.33	1.20	1.20
# Kryo		1.26	0.94	1.00
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;

# Command line: first argument is the "flavour" (selects 32- vs 64-bit
# output and OS-specific conventions), second is the output file.
$flavour = shift;
$output = shift;

# Locate the arm-xlate.pl translator next to this script or in the
# shared perlasm directory; all emitted text is piped through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# Symbol prefix for all exported functions (aes_v8_set_encrypt_key etc.).
$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___				if ($flavour !~ /64/);
.arch	armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Key-schedule setup: ${prefix}_set_encrypt_key expands the user key
# in-place; ${prefix}_set_decrypt_key reuses it and then reverses the
# schedule, applying AESIMC to the inner round keys (.Loop_imc below).
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));

# .Lrcon holds the AES round constants plus a byte-permutation mask
# used by vtbl.8 to implement rotate-n-splat of the last key word.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument sanity checks return -1 (NULL pointers) or -2 (bad bit
# length); $bits must be 128, 192 or 256. Dedicated expansion loops
# follow for each key size; aese with an all-zero round key is used
# as a plain SubBytes.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Build the encrypt schedule via .Lenc_key, then swap round keys
# end-for-end, running aesimc over every key except the two outermost.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Single-block ECB-style primitives: gen_block emits ${prefix}_encrypt
# and ${prefix}_decrypt; $dir selects the aese/aesmc or aesd/aesimc
# mnemonic pair, the code is otherwise identical.
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
# ${prefix}_cbc_encrypt(inp,out,len,key,ivp,enc): CBC mode. Encrypt
# is inherently serial; decrypt processes three blocks in parallel
# (.Loop3x_cbc_dec). A dedicated 128-bit encrypt path exists
# (.Lcbc_enc128) per the header notes.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
# cclr below is a pseudo-mnemonic: conditional clear, transliterated
# to csel (64-bit) or mov<cc> #0 (32-bit) by the post-pass.
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# CBC decrypt: 3-block interleave, with a 1-2 block tail path.
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# ${prefix}_ctr32_encrypt_blocks(inp,out,len,key,ivp): CTR mode with a
# 32-bit big-endian counter in the last word of the IV; 3-block
# interleave with a 1-2 block tail, counter maintained host-endian in
# $ctr and byte-reversed into lane 3 of each data register.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
# Post-processing: the common $code above is transliterated line by
# line into genuine 64-bit or 32-bit assembly. Substitutions are
# or-chained and ORDER-SENSITIVE: mnemonic rewrites must run before
# the legacy/new-style suffix clean-up passes.
if ($flavour =~ /64/) {			######## 64-bit code
    # AES instructions are emitted as raw .inst words so that
    # pre-crypto-extension assemblers still accept the file.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode "aesX Vd,Vn" into its fixed 32-bit opcode word; Rd goes in
    # bits [4:0], Rn in bits [9:5].
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;	# expand `...` escapes

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;	# 8-byte post-index => D-register op
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # ARMv7 NEON AES opcodes, emitted as .byte sequences (see note
    # below on why .inst is not used).
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Encode "aesX qd,qn": the 4-bit register fields are split across
    # the opcode word (low 3 bits + "hi" bit each).
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # 64-bit vtbl with a Q-register index has no 32-bit equivalent;
    # split it into two D-register vtbl.8 instructions.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Map a Q-register lane splat onto the corresponding D-register lane.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Map a Q-register lane move onto the corresponding D-register lane.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;	# expand `...` escapes

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";