#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in sense that it supports both big- and
# little-endian cases. As does it support both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;

# Command line: <flavour> <output>.  Both are forwarded verbatim to
# arm-xlate.pl; <flavour> additionally selects between the 32-bit and
# 64-bit code paths in this script (any flavour matching /64/ is 64-bit).
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl, first next to this script, then in the shared
# perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the pipe cannot be set up, otherwise the
# build would carry on with missing or empty assembly output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Prefix of every exported symbol.
$prefix="aes_v8";

# $code accumulates the assembly template; it is translated to the
# requested flavour and printed by the post-processing pass at the end
# of the file.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# NOTE(review): the 64-bit .arch directive is disabled in this copy, so
# the assembler is presumably expected to have the crypto extension
# enabled by other means -- confirm against the build flags.
# $code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
# Each {{{ ... }}} group below is a set of nested bare blocks whose only
# purpose is to scope the lexical register-name variables.
#
# IMPORTANT: inside the heredocs, instruction lines must start with
# whitespace while labels, directives and preprocessor lines start in
# column 0; several post-processing regexes anchor on /^(\s+)/.

########################################################################
# ${prefix}_set_encrypt_key(inp=x0, bits=w1, out=x2)
# ${prefix}_set_decrypt_key(inp=x0, bits=w1, out=x2)
#
# Return value (x0): 0 on success, -1 if inp or out is NULL, -2 if bits
# is below 128, above 256 or not a multiple of 64.  The number of rounds
# (10, 12 or 14) is stored at offset 240 from the start of the schedule.
# set_decrypt_key builds the encryption schedule via .Lenc_key, then
# reverses the order of the round keys and applies aesimc to all but the
# first and last of them.
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

########################################################################
# ${prefix}_encrypt(inp=x0, out=x1, key=x2)
# ${prefix}_decrypt(inp=x0, out=x1, key=x2)
#
# Single-block primitives.  Both are produced from one template by
# gen_block(), which only differs in the instruction suffixes used.
{{{
# gen_block($dir) appends the single-block routine ${prefix}_${dir}crypt
# to $code.  $dir is "en" (aese/aesmc) or "de" (aesd/aesimc).
#
# The sub used to carry an empty "()" prototype even though it takes one
# argument, which only worked because it was called with a leading "&".
# The prototype is dropped so that a plain call works as well.
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}

########################################################################
# ${prefix}_cbc_encrypt(inp=x0, out=x1, len=x2, key=x3, ivp=x4, enc=w5)
#
# len is in bytes; the routine returns immediately if len is below 16 and
# otherwise rounds it down to a multiple of 16.  enc selects encryption
# (non-zero) or decryption (zero).  The updated IV is written back to ivp.
# Decryption processes three blocks per iteration; encryption has a
# dedicated code path for 128-bit keys.
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

########################################################################
# ${prefix}_ctr32_encrypt_blocks(inp=x0, out=x1, len=x2, key=x3, ivp=x4)
#
# len counts 16-byte blocks.  The counter is the 32-bit word at offset 12
# of the IV; it is byte-swapped on little-endian builds before being
# incremented.  Three blocks are processed per main-loop iteration, with
# a tail that handles the remaining one or two blocks.
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Closes the "#if __ARM_MAX_ARCH__>=7" opened at the top of the template.
$code.=<<___;
#endif
___
901######################################## 902if ($flavour =~ /64/) { ######## 64-bit code 903 my %opcode = ( 904 "aesd" => 0x4e285800, "aese" => 0x4e284800, 905 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); 906 907 local *unaes = sub { 908 my ($mnemonic,$arg)=@_; 909 910 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && 911 sprintf ".inst\t0x%08x\t//%s %s", 912 $opcode{$mnemonic}|$1|($2<<5), 913 $mnemonic,$arg; 914 }; 915 916 foreach(split("\n",$code)) { 917 s/\`([^\`]*)\`/eval($1)/geo; 918 919 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers 920 s/@\s/\/\//o; # old->new style commentary 921 922 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 923 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or 924 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or 925 s/vmov\.i8/movi/o or # fix up legacy mnemonics 926 s/vext\.8/ext/o or 927 s/vrev32\.8/rev32/o or 928 s/vtst\.8/cmtst/o or 929 s/vshr/ushr/o or 930 s/^(\s+)v/$1/o or # strip off v prefix 931 s/\bbx\s+lr\b/ret/o; 932 933 # fix up remaining legacy suffixes 934 s/\.[ui]?8//o; 935 m/\],#8/o and s/\.16b/\.8b/go; 936 s/\.[ui]?32//o and s/\.16b/\.4s/go; 937 s/\.[ui]?64//o and s/\.16b/\.2d/go; 938 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; 939 940 print $_,"\n"; 941 } 942} else { ######## 32-bit code 943 my %opcode = ( 944 "aesd" => 0xf3b00340, "aese" => 0xf3b00300, 945 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); 946 947 local *unaes = sub { 948 my ($mnemonic,$arg)=@_; 949 950 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { 951 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) 952 |(($2&7)<<1) |(($2&8)<<2); 953 # since ARMv7 instructions are always encoded little-endian. 
954 # correct solution is to use .inst directive, but older 955 # assemblers don't implement it:-( 956 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", 957 $word&0xff,($word>>8)&0xff, 958 ($word>>16)&0xff,($word>>24)&0xff, 959 $mnemonic,$arg; 960 } 961 }; 962 963 sub unvtbl { 964 my $arg=shift; 965 966 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && 967 sprintf "vtbl.8 d%d,{q%d},d%d\n\t". 968 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 969 } 970 971 sub unvdup32 { 972 my $arg=shift; 973 974 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && 975 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; 976 } 977 978 sub unvmov32 { 979 my $arg=shift; 980 981 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && 982 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; 983 } 984 985 foreach(split("\n",$code)) { 986 s/\`([^\`]*)\`/eval($1)/geo; 987 988 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers 989 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers 990 s/\/\/\s?/@ /o; # new->old style commentary 991 992 # fix up remaining new-style suffixes 993 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or 994 s/\],#[0-9]+/]!/o; 995 996 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 997 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or 998 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or 999 s/vdup\.32\s+(.*)/unvdup32($1)/geo or 1000 s/vmov\.32\s+(.*)/unvmov32($1)/geo or 1001 s/^(\s+)b\./$1b/o or 1002 s/^(\s+)mov\./$1mov/o or 1003 s/^(\s+)ret/$1bx\tlr/o; 1004 1005 print $_,"\n"; 1006 } 1007} 1008 1009close STDOUT; 1010