1#! /usr/bin/env perl 2# Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10#======================================================================== 11# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project, 12# derived from https://github.com/ARM-software/AArch64cryptolib, original 13# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual 14# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you 15# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/. 16#======================================================================== 17# 18# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants 19# 20# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks 21# 22# ____________________________________________________ 23# | | 24# | PRE | 25# |____________________________________________________| 26# | | | | 27# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 | 28# |________________|________________|__________________| 29# | | | | 30# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 | 31# |________________|________________|__________________| 32# | | | | 33# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 | 34# |________________|________________|__________________| 35# | | | | 36# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 | 37# |________________|____(mostly)____|__________________| 38# | | 39# | MODULO | 40# |____________________________________________________| 41# 42# PRE: 43# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0 44# EXT 
low_acc, low_acc, low_acc, #8 45# EOR res_curr (4k+0), res_curr (4k+0), low_acc 46# 47# CTR block: 48# Increment and byte reverse counter in scalar registers and transfer to SIMD registers 49# REV ctr32, rev_ctr32 50# ORR ctr64, constctr96_top32, ctr32, LSL #32 51# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF 52# INS ctr_next.d[1], ctr64X 53# ADD rev_ctr32, #1 54# 55# AES block: 56# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take 256-bit key below for example. 57# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring 58# Given we are very constrained in our ASIMD registers this is quite important 59# 60# Encrypt: 61# LDR input_low, [ input_ptr ], #8 62# LDR input_high, [ input_ptr ], #8 63# EOR input_low, k14_low 64# EOR input_high, k14_high 65# INS res_curr.d[0], input_low 66# INS res_curr.d[1], input_high 67# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr 68# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr 69# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr 70# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr 71# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr 72# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr 73# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr 74# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr 75# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr 76# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr 77# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr 78# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr 79# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr 80# AESE ctr_curr, k13 81# EOR res_curr, res_curr, ctr_curr 82# ST1 { res_curr.16b }, [ output_ptr ], #16 83# 84# Decrypt: 85# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr 86# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr 87# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr 88# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr 89# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr 90# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr 
91# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr 92# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr 93# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr 94# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr 95# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr 96# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr 97# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr 98# AESE ctr_curr, k13 99# LDR res_curr, [ input_ptr ], #16 100# EOR res_curr, res_curr, ctr_curr 101# MOV output_low, res_curr.d[0] 102# MOV output_high, res_curr.d[1] 103# EOR output_low, k14_low 104# EOR output_high, k14_high 105# STP output_low, output_high, [ output_ptr ], #16 106# 107# GHASH block X: 108# do 128b karatsuba polynomial multiplication on block 109# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b 110# 111# multiplication: 112# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 113# 114# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies: 115# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64 116# 117# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are 118# multiplying with "twisted" powers of H 119# 120# Note: We can PMULL directly into the acc_x in first GHASH of the loop 121# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical 122# path latency dominates the performance 123# 124# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers 125# than indicated here 126# REV64 res_curr, res_curr 127# INS t_m.d[0], res_curr.d[1] 128# EOR t_m.8B, t_m.8B, res_curr.8B 129# PMULL2 t_h, res_curr, HX 130# PMULL t_l, res_curr, HX 131# PMULL t_m, t_m, HX_k 132# EOR acc_h, acc_h, t_h 133# EOR acc_l, acc_l, t_l 134# EOR acc_m, acc_m, t_m 135# 136# MODULO: 
#         take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
#         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
#         with a reversed constant
#     EOR     acc_m, acc_m, acc_h
#     EOR     acc_m, acc_m, acc_l                     // Finish off karatsuba processing
#     PMULL   t_mod, acc_h, mod_constant
#     EXT     acc_h, acc_h, acc_h, #8
#     EOR     acc_m, acc_m, acc_h
#     EOR     acc_m, acc_m, t_mod
#     PMULL   acc_h, acc_m, mod_constant
#     EXT     acc_m, acc_m, acc_m, #8
#     EOR     acc_l, acc_l, acc_h
#     EOR     acc_l, acc_l, acc_m

# Command line: [flavour] [output-file]
#   $output  - last argument, taken only if it looks like a file name (has a ".ext" suffix)
#   $flavour - first argument, taken only if it does not contain a "."
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate arm-xlate.pl, first next to this script, then in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# The translator path and the output file name are quoted so that paths
# containing spaces survive the shell, and a failed pipe/fork is fatal
# instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Scalar registers holding the kernel's arguments. The kernel prologue copies
# x4 into $counter (x16) and x5 into $cc (x8) before using them.
$input_ptr="x0";  #argument block
$bit_length="x1";
$output_ptr="x2";
$current_tag="x3";
$counter="x16";
$cc="x8";

{
# General-purpose register aliases. Note that the input_* and output_* names
# for blocks 1..3 map onto the same registers (x19..x24), and input_l0/h0
# share x6/x7 with output_l0/h0.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
my ($output_l0,$output_h0)=map("x$_",(6..7));

# Counter handling: w9/x9 is the byte-reversed working counter, x10/x11 hold
# the two 64-bit words loaded from the counter block, w12/x12 the running
# 32-bit counter; x13/x14 hold the last round key, x15 the byte length.
my $ctr32w="w9";
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));

# v0..v3: counter blocks being encrypted; v4..v7: result blocks.
# The same registers are named in .16b, plain vN, dN and qN views.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));

# v9..v11: GHASH high / mid / low accumulators.
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));

# v12..v15: powers of H; v16/v17: the combined "k" values used for the
# karatsuba middle products.
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));

# Temporaries. Several names deliberately alias the same register
# (t0/t4/mod_constant = v8, t1/t5 = v28, t2/t8 = v29, t3/t9 = v30,
# t6/mod_t = v31, t7/ctr_t0/res0 = v4), so the instruction schedule must keep
# their live ranges apart.
my $t0="v8";
my $t0d="d8";

my ($t1,$t2,$t3)=map("v$_",(28..30));
my ($t1d,$t2d,$t3d)=map("d$_",(28..30));

my $t4="v8";
my $t4d="d8";
my $t5="v28";
my $t5d="d28";
my $t6="v31";
my $t6d="d31";

my $t7="v4";
my $t7d="d4";
my $t8="v29";
my $t8d="d29";
my $t9="v30";
my $t9d="d30";

# Staging registers for input blocks moved in from scalar registers; these
# are the same physical registers as res0..res3 (v4..v7).
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));

# Reduction constant and its scratch register (alias v8 and v31, see above).
my $mod_constantd="d8";
my $mod_constant="v8";
my $mod_t="v31";

# v18..v27: round keys rk0..rk9 in .16b, .4s and qN views. v20..v22 are also
# given extra names so they can double as scratch registers.
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
my $rk4v="v22";
my $rk4d="d22";

# Common assembly prologue.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
___
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
# NOTE(review): $_byte is not assigned anywhere in the visible part of this
# file, so it interpolates as an empty string below - confirm whether this
# non-64-bit branch is ever meant to be taken for this module.
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,0xf2
#endif

.text
___

#########################################################################################
# Argument order below follows the register usage in the kernel itself:
# x1 is shifted right by 3 to obtain the byte length, x3 is the base for the
# tag load and the H-power loads (offsets 32..112), x4 is copied to $counter
# and x5 to $cc (round keys).
#
# size_t aes_gcm_enc_128_kernel(const unsigned char *in,    /* x0 */
#                               size_t bit_len,             /* x1: length in bits */
#                               unsigned char *out,         /* x2 */
#                               u64 *Xi,                    /* x3: current tag, H powers follow */
#                               unsigned char ivec[16],     /* x4 */
#                               const void *key);           /* x5 */
#
$code.=<<___;
.global aes_gcm_enc_128_kernel
.type 
aes_gcm_enc_128_kernel,%function 258.align 4 259aes_gcm_enc_128_kernel: 260 cbz x1, .L128_enc_ret 261 stp x19, x20, [sp, #-112]! 262 mov x16, x4 263 mov x8, x5 264 stp x21, x22, [sp, #16] 265 stp x23, x24, [sp, #32] 266 stp d8, d9, [sp, #48] 267 stp d10, d11, [sp, #64] 268 stp d12, d13, [sp, #80] 269 stp d14, d15, [sp, #96] 270 271 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 272#ifdef __AARCH64EB__ 273 rev $ctr96_b64x, $ctr96_b64x 274 rev $ctr96_t32x, $ctr96_t32x 275#endif 276 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 277#ifdef __AARCH64EB__ 278 ror $rk10_l, $rk10_l, #32 279 ror $rk10_h, $rk10_h, #32 280#endif 281 ld1 {$acc_lb}, [$current_tag] 282 ext $acc_lb, $acc_lb, $acc_lb, #8 283 rev64 $acc_lb, $acc_lb 284 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 285 mov $len, $main_end_input_ptr 286 287 ld1 {$rk0s}, [$cc], #16 @ load rk0 288 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 289 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 290 291 lsr $rctr32x, $ctr96_t32x, #32 292 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 293#ifndef __AARCH64EB__ 294 ext $h4b, $h4b, $h4b, #8 295#endif 296 fmov $ctr1d, $ctr96_b64x @ CTR block 1 297 rev $rctr32w, $rctr32w @ rev_ctr32 298 299 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 300 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 301 ld1 {$rk1s}, [$cc], #16 @ load rk1 302 303 rev $ctr32w, $rctr32w @ CTR block 1 304 add $rctr32w, $rctr32w, #1 @ CTR block 1 305 fmov $ctr3d, $ctr96_b64x @ CTR block 3 306 307 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 308 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 309 310 fmov $ctr1.d[1], $ctr32x @ CTR block 1 311 rev $ctr32w, $rctr32w @ CTR block 2 312 313 fmov $ctr2d, $ctr96_b64x @ CTR block 2 314 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 315 add $rctr32w, $rctr32w, #1 @ CTR block 2 316 317 fmov $ctr2.d[1], $ctr32x @ CTR 
block 2 318 rev $ctr32w, $rctr32w @ CTR block 3 319 320 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 321 ld1 {$rk2s}, [$cc], #16 @ load rk2 322 323 add $rctr32w, $rctr32w, #1 @ CTR block 3 324 fmov $ctr3.d[1], $ctr32x @ CTR block 3 325 326 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 327#ifndef __AARCH64EB__ 328 ext $h3b, $h3b, $h3b, #8 329#endif 330 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 331 ld1 {$rk3s}, [$cc], #16 @ load rk3 332 333 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 334 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 335#ifndef __AARCH64EB__ 336 ext $h1b, $h1b, $h1b, #8 337#endif 338 339 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 340 ld1 {$rk4s}, [$cc], #16 @ load rk4 341 342 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 343 ld1 {$rk5s}, [$cc], #16 @ load rk5 344 345 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 346 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 347 348 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 349 ld1 {$rk6s}, [$cc], #16 @ load rk6 350 351 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 352 ld1 {$rk7s}, [$cc], #16 @ load rk7 353 354 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 355 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 356 357 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 358 ld1 {$rk8s}, [$cc], #16 @ load rk8 359 360 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 361 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 362#ifndef __AARCH64EB__ 363 ext $h2b, $h2b, $h2b, #8 364#endif 365 366 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 367 368 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 369 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 370 371 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 372 373 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - 
round 3 374 375 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 376 ld1 {$rk9s}, [$cc], #16 @ load rk9 377 378 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 379 380 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 381 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 382 383 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 384 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 385 386 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 387 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 388 389 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 390 391 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 392 393 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 394 395 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 396 397 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 398 399 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 400 401 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 402 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 403 404 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 405 406 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 407 408 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 409 410 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 411 412 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 413 414 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 415 416 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 417 418 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 419 420 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 421 422 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 423 424 aese 
$ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 425 426 aese $ctr2b, $rk9 @ AES block 2 - round 9 427 428 aese $ctr0b, $rk9 @ AES block 0 - round 9 429 430 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 431 432 aese $ctr1b, $rk9 @ AES block 1 - round 9 433 434 aese $ctr3b, $rk9 @ AES block 3 - round 9 435 b.ge .L128_enc_tail @ handle tail 436 437 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext 438#ifdef __AARCH64EB__ 439 rev $input_l0, $input_l0 440 rev $input_h0, $input_h0 441#endif 442 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext 443#ifdef __AARCH64EB__ 444 rev $input_l2, $input_l2 445 rev $input_h2, $input_h2 446#endif 447 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext 448#ifdef __AARCH64EB__ 449 rev $input_l1, $input_l1 450 rev $input_h1, $input_h1 451#endif 452 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext 453#ifdef __AARCH64EB__ 454 rev $input_l3, $input_l3 455 rev $input_h3, $input_h3 456#endif 457 eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low 458 eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high 459 460 eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low 461 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low 462 463 eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low 464 eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high 465 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high 466 467 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low 468 eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high 469 470 eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low 471 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high 472 473 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low 474 eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high 475 rev $ctr32w, $rctr32w @ CTR block 4 476 477 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high 478 orr $ctr32x, 
$ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 479 480 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result 481 fmov $ctr0d, $ctr96_b64x @ CTR block 4 482 add $rctr32w, $rctr32w, #1 @ CTR block 4 483 484 fmov $ctr0.d[1], $ctr32x @ CTR block 4 485 rev $ctr32w, $rctr32w @ CTR block 5 486 487 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result 488 fmov $ctr1d, $ctr96_b64x @ CTR block 5 489 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 490 491 add $rctr32w, $rctr32w, #1 @ CTR block 5 492 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 493 fmov $ctr1.d[1], $ctr32x @ CTR block 5 494 495 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low 496 rev $ctr32w, $rctr32w @ CTR block 6 497 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result 498 499 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high 500 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 501 502 add $rctr32w, $rctr32w, #1 @ CTR block 6 503 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result 504 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result 505 506 fmov $ctr2d, $ctr96_b64x @ CTR block 6 507 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 508 509 fmov $ctr2.d[1], $ctr32x @ CTR block 6 510 rev $ctr32w, $rctr32w @ CTR block 7 511 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result 512 513 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 514 515 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result 516 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result 517 b.ge .L128_enc_prepretail @ do prepretail 518 519 .L128_enc_main_loop: @ main loop start 520 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext 521#ifdef __AARCH64EB__ 522 rev $input_l3, $input_l3 523 rev $input_h3, $input_h3 524#endif 525 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 526 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 527 528 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 529 fmov 
$ctr3d, $ctr96_b64x @ CTR block 4k+3 530 531 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 532 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 533 534 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 535 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 536 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 537 538 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 539 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 540 541 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 542 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 543 544 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 545 eor $res0b, $res0b, $acc_lb @ PRE 1 546 547 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 548 eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high 549 550 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 551 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 552 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext 553#ifdef __AARCH64EB__ 554 rev $input_l0, $input_l0 555 rev $input_h0, $input_h0 556#endif 557 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 558 rev $ctr32w, $rctr32w @ CTR block 4k+8 559 560 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 561 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 562 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 563 564 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 565 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 566 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 567 568 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 569 570 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 571 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 572 573 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 574 575 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 576 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - 
high 577 578 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 579 580 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 581 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 582 583 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 584 585 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 586 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 587 588 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 589 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high 590 591 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 592 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 593 594 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 595 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 596 597 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 598 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low 599 600 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 601 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 602 603 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 604 605 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 606 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 607 608 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 609 610 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 611 movi $mod_constant.8b, #0xc2 612 613 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 614 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 615 616 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 617 618 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 619 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 620 621 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 622 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 623 624 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 625 ldp $input_l1, $input_h1, 
[$input_ptr, #16] @ AES block 4k+5 - load plaintext 626#ifdef __AARCH64EB__ 627 rev $input_l1, $input_l1 628 rev $input_h1, $input_h1 629#endif 630 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 631 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 632 633 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 634 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext 635#ifdef __AARCH64EB__ 636 rev $input_l2, $input_l2 637 rev $input_h2, $input_h2 638#endif 639 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 640 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 641 642 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 643 eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low 644 645 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 646 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 647 648 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 649 eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low 650 651 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 652 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 653 654 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 655 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 656 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 657 658 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 659 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low 660 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 661 662 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 663 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low 664 665 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 666 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 667 668 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 669 eor $input_h1, 
$input_h1, $rk10_h @ AES block 4k+5 - round 10 high 670 671 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 672 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high 673 674 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 675 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high 676 677 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 678 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 679 680 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 681 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 682 683 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 684 eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low 685 eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high 686 687 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 688 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low 689 690 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 691 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high 692 693 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 694 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 695 696 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 697 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 698 699 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 700 rev $ctr32w, $rctr32w @ CTR block 4k+9 701 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 702 703 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 704 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result 705 706 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 707 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 708 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 709 710 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 711 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 712 rev $ctr32w, $rctr32w @ CTR block 4k+10 713 714 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 715 st1 { $res0b}, 
[$output_ptr], #16 @ AES block 4k+4 - store result 716 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result 717 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 718 719 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 720 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 721 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 722 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 723 724 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 725 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result 726 727 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 728 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result 729 rev $ctr32w, $rctr32w @ CTR block 4k+11 730 731 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 732 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result 733 734 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 735 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result 736 b.lt .L128_enc_main_loop 737 738 .L128_enc_prepretail: @ PREPRETAIL 739 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 740 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 741 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 742 743 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 744 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 745 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 746 747 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 748 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 749 750 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 751 752 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 753 eor $res0b, $res0b, $acc_lb @ PRE 1 754 755 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 756 757 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 758 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 759 760 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 761 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 762 763 mov $t6d, $res2.d[1] @ GHASH 
block 4k+2 - mid 764 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 765 766 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 767 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 768 769 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 770 771 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 772 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 773 774 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 775 776 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 777 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 778 779 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 780 781 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 782 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 783 784 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 785 786 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 787 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 788 789 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 790 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 791 792 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 793 794 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 795 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 796 797 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 798 799 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 800 801 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 802 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 803 804 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 805 806 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 807 movi $mod_constant.8b, #0xc2 808 809 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 810 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 811 812 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 813 814 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH 
block 4k+3 - mid 815 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 816 817 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 818 819 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 820 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 821 822 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 823 824 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 825 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 826 827 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 828 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 829 830 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 831 832 pmull $t1.1q, $acc_h.1d, $mod_constant.1d 833 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up 834 835 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 836 837 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 838 ext $acc_hb, $acc_hb, $acc_hb, #8 839 840 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 841 842 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 843 eor $acc_mb, $acc_mb, $acc_lb 844 845 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 846 847 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 848 849 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 850 851 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 852 eor $acc_mb, $acc_mb, $t1.16b 853 854 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 855 856 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 857 858 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 859 860 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 861 eor $acc_mb, $acc_mb, $acc_hb 862 863 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 864 865 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ 
AES block 4k+6 - round 7 866 867 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 868 869 pmull $t1.1q, $acc_m.1d, $mod_constant.1d 870 871 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 872 ext $acc_mb, $acc_mb, $acc_mb, #8 873 874 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 875 876 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 877 eor $acc_lb, $acc_lb, $t1.16b 878 879 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 880 881 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 882 883 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 884 885 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 886 887 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 888 eor $acc_lb, $acc_lb, $acc_mb 889 890 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 891 .L128_enc_tail: @ TAIL 892 893 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 894 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext 895#ifdef __AARCH64EB__ 896 rev $input_l0, $input_l0 897 rev $input_h0, $input_h0 898#endif 899 cmp $main_end_input_ptr, #48 900 901 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 902 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low 903 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high 904 905 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 906 907 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 908 909 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 910 911 b.gt .L128_enc_blocks_more_than_3 912 913 sub $rctr32w, $rctr32w, #1 914 movi $acc_l.8b, #0 915 mov $ctr3b, $ctr2b 916 917 cmp $main_end_input_ptr, #32 918 mov $ctr2b, $ctr1b 919 movi $acc_h.8b, #0 920 921 movi $acc_m.8b, #0 922 b.gt .L128_enc_blocks_more_than_2 923 924 mov $ctr3b, $ctr1b 925 cmp $main_end_input_ptr, #16 926 927 sub $rctr32w, $rctr32w, #1 928 b.gt 
.L128_enc_blocks_more_than_1 929 930 sub $rctr32w, $rctr32w, #1 931 b .L128_enc_blocks_less_than_1 932 .L128_enc_blocks_more_than_3: @ blocks left > 3 933 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result 934 935 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high 936#ifdef __AARCH64EB__ 937 rev $input_l0, $input_l0 938 rev $input_h0, $input_h0 939#endif 940 rev64 $res0b, $res1b @ GHASH final-3 block 941 942 eor $res0b, $res0b, $t0.16b @ feed in partial tag 943 eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high 944 eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low 945 946 fmov $res1d, $input_l0 @ AES final-2 block - mov low 947 948 movi $t0.8b, #0 @ suppress further partial tag feed in 949 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high 950 951 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 952 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 953 954 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 955 956 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 957 958 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result 959 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 960 961 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 962 .L128_enc_blocks_more_than_2: @ blocks left > 2 963 964 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result 965 966 rev64 $res0b, $res1b @ GHASH final-2 block 967 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high 968#ifdef __AARCH64EB__ 969 rev $input_l0, $input_l0 970 rev $input_h0, $input_h0 971#endif 972 eor $res0b, $res0b, $t0.16b @ feed in partial tag 973 974 eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low 975 976 fmov $res1d, $input_l0 @ AES final-1 block - mov low 977 eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high 978 979 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block 
- high 980 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high 981 982 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 983 984 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 985 986 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 987 988 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 989 990 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result 991 992 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 993 994 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 995 996 movi $t0.8b, #0 @ suppress further partial tag feed in 997 998 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 999 .L128_enc_blocks_more_than_1: @ blocks left > 1 1000 1001 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result 1002 1003 rev64 $res0b, $res1b @ GHASH final-1 block 1004 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high 1005#ifdef __AARCH64EB__ 1006 rev $input_l0, $input_l0 1007 rev $input_h0, $input_h0 1008#endif 1009 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1010 1011 eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high 1012 eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low 1013 1014 fmov $res1d, $input_l0 @ AES final block - mov low 1015 1016 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 1017 fmov $res1.d[1], $input_h0 @ AES final block - mov high 1018 1019 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 1020 1021 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 1022 1023 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 1024 1025 eor $res1b, $res1b, $ctr3b @ AES final block - result 1026 1027 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 1028 1029 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 1030 1031 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 1032 1033 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 1034 1035 eor $acc_mb, $acc_mb, 
$rk4v.16b @ GHASH final-1 block - mid 1036 movi $t0.8b, #0 @ suppress further partial tag feed in 1037 .L128_enc_blocks_less_than_1: @ blocks left <= 1 1038 1039 and $bit_length, $bit_length, #127 @ bit_length %= 128 1040 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff 1041 1042 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff 1043 sub $bit_length, $bit_length, #128 @ bit_length -= 128 1044 1045 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 1046 1047 and $bit_length, $bit_length, #127 @ bit_length %= 128 1048 1049 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block 1050 cmp $bit_length, #64 1051 1052 csel $input_l0, $rk10_l, $rk10_h, lt 1053 csel $input_h0, $rk10_h, xzr, lt 1054 1055 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block 1056 1057 fmov $ctr0.d[1], $input_h0 1058 1059 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 1060 1061 rev64 $res0b, $res1b @ GHASH final block 1062 1063 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1064 1065 mov $t0d, $res0.d[1] @ GHASH final block - mid 1066 1067 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 1068 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored 1069 1070 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 1071#ifndef __AARCH64EB__ 1072 rev $ctr32w, $rctr32w 1073#else 1074 mov $ctr32w, $rctr32w 1075#endif 1076 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 1077 1078 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 1079 1080 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 1081 1082 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 1083 1084 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 1085 movi $mod_constant.8b, #0xc2 1086 1087 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 1088 1089 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 1090 1091 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba 
tidy up 1092 1093 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 1094 1095 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 1096 1097 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 1098 1099 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 1100 1101 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 1102 1103 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 1104 1105 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing 1106 1107 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 1108 st1 { $res1b}, [$output_ptr] @ store all 16B 1109 1110 str $ctr32w, [$counter, #12] @ store the updated counter 1111 1112 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 1113 ext $acc_lb, $acc_lb, $acc_lb, #8 1114 rev64 $acc_lb, $acc_lb 1115 mov x0, $len 1116 st1 { $acc_l.16b }, [$current_tag] 1117 ldp x21, x22, [sp, #16] 1118 ldp x23, x24, [sp, #32] 1119 ldp d8, d9, [sp, #48] 1120 ldp d10, d11, [sp, #64] 1121 ldp d12, d13, [sp, #80] 1122 ldp d14, d15, [sp, #96] 1123 ldp x19, x20, [sp], #112 1124 ret 1125 1126.L128_enc_ret: 1127 mov w0, #0x0 1128 ret 1129.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel 1130___ 1131 1132######################################################################################### 1133# size_t aes_gcm_dec_128_kernel(const unsigned char *in, 1134# size_t len, 1135# unsigned char *out, 1136# const void *key, 1137# unsigned char ivec[16], 1138# u64 *Xi); 1139# 1140$code.=<<___; 1141.global aes_gcm_dec_128_kernel 1142.type aes_gcm_dec_128_kernel,%function 1143.align 4 1144aes_gcm_dec_128_kernel: 1145 cbz x1, .L128_dec_ret 1146 stp x19, x20, [sp, #-112]! 
1147 mov x16, x4 1148 mov x8, x5 1149 stp x21, x22, [sp, #16] 1150 stp x23, x24, [sp, #32] 1151 stp d8, d9, [sp, #48] 1152 stp d10, d11, [sp, #64] 1153 stp d12, d13, [sp, #80] 1154 stp d14, d15, [sp, #96] 1155 1156 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 1157 mov $len, $main_end_input_ptr 1158 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 1159#ifdef __AARCH64EB__ 1160 rev $ctr96_b64x, $ctr96_b64x 1161 rev $ctr96_t32x, $ctr96_t32x 1162#endif 1163 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 1164#ifdef __AARCH64EB__ 1165 ror $rk10_h, $rk10_h, 32 1166 ror $rk10_l, $rk10_l, 32 1167#endif 1168 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 1169 ld1 {$rk0s}, [$cc], #16 @ load rk0 1170 1171 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 1172 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 1173 1174 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 1175#ifndef __AARCH64EB__ 1176 ext $h2b, $h2b, $h2b, #8 1177#endif 1178 lsr $rctr32x, $ctr96_t32x, #32 1179 fmov $ctr2d, $ctr96_b64x @ CTR block 2 1180 1181 ld1 {$rk1s}, [$cc], #16 @ load rk1 1182 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 1183 rev $rctr32w, $rctr32w @ rev_ctr32 1184 1185 fmov $ctr1d, $ctr96_b64x @ CTR block 1 1186 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 1187 1188 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 1189 rev $ctr32w, $rctr32w @ CTR block 1 1190 1191 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 1192 ld1 {$rk2s}, [$cc], #16 @ load rk2 1193 add $rctr32w, $rctr32w, #1 @ CTR block 1 1194 1195 fmov $ctr1.d[1], $ctr32x @ CTR block 1 1196 rev $ctr32w, $rctr32w @ CTR block 2 1197 add $rctr32w, $rctr32w, #1 @ CTR block 2 1198 1199 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 1200 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR 
block 2 1201 1202 fmov $ctr2.d[1], $ctr32x @ CTR block 2 1203 rev $ctr32w, $rctr32w @ CTR block 3 1204 1205 fmov $ctr3d, $ctr96_b64x @ CTR block 3 1206 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 1207 add $rctr32w, $rctr32w, #1 @ CTR block 3 1208 1209 fmov $ctr3.d[1], $ctr32x @ CTR block 3 1210 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 1211 1212 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 1213 ld1 {$rk3s}, [$cc], #16 @ load rk3 1214 1215 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 1216 ld1 {$rk4s}, [$cc], #16 @ load rk4 1217 1218 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 1219 ld1 {$rk5s}, [$cc], #16 @ load rk5 1220 1221 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 1222 ld1 {$rk6s}, [$cc], #16 @ load rk6 1223 1224 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 1225 1226 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 1227 1228 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 1229 1230 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 1231 ld1 { $acc_lb}, [$current_tag] 1232 ext $acc_lb, $acc_lb, $acc_lb, #8 1233 rev64 $acc_lb, $acc_lb 1234 1235 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 1236 ld1 {$rk7s}, [$cc], #16 @ load rk7 1237 1238 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 1239 1240 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 1241 1242 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 1243 ld1 {$rk8s}, [$cc], #16 @ load rk8 1244 1245 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 1246 1247 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 1248 1249 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 1250 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 1251#ifndef __AARCH64EB__ 1252 ext $h3b, $h3b, $h3b, #8 1253#endif 1254 aese $ctr0b, 
$rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 1255 ld1 {$rk9s}, [$cc], #16 @ load rk9 1256 1257 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 1258 1259 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 1260 1261 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 1262 1263 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 1264 1265 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 1266 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 1267#ifndef __AARCH64EB__ 1268 ext $h1b, $h1b, $h1b, #8 1269#endif 1270 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 1271 1272 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 1273 1274 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 1275 1276 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 1277 1278 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 1279 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 1280 1281 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 1282#ifndef __AARCH64EB__ 1283 ext $h4b, $h4b, $h4b, #8 1284#endif 1285 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 1286 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 1287 1288 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 1289 1290 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 1291 1292 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 1293 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 1294 1295 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 1296 1297 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 1298 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 1299 1300 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 1301 1302 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 1303 1304 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 1305 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 
1306 1307 aese $ctr2b, $rk9 @ AES block 2 - round 9 1308 1309 aese $ctr3b, $rk9 @ AES block 3 - round 9 1310 1311 aese $ctr0b, $rk9 @ AES block 0 - round 9 1312 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 1313 1314 aese $ctr1b, $rk9 @ AES block 1 - round 9 1315 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 1316 b.ge .L128_dec_tail @ handle tail 1317 1318 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext 1319 1320 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result 1321 ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext 1322 1323 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result 1324 rev64 $res0b, $res0b @ GHASH block 0 1325 rev $ctr32w, $rctr32w @ CTR block 4 1326 1327 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 1328 add $rctr32w, $rctr32w, #1 @ CTR block 4 1329 ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext 1330 1331 rev64 $res1b, $res1b @ GHASH block 1 1332 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low 1333 1334 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high 1335 1336 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low 1337 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 1338 1339 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high 1340 1341 fmov $ctr0d, $ctr96_b64x @ CTR block 4 1342 1343 fmov $ctr0.d[1], $ctr32x @ CTR block 4 1344 rev $ctr32w, $rctr32w @ CTR block 5 1345 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low 1346#ifdef __AARCH64EB__ 1347 rev $output_l1, $output_l1 1348#endif 1349 fmov $ctr1d, $ctr96_b64x @ CTR block 5 1350 add $rctr32w, $rctr32w, #1 @ CTR block 5 1351 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 1352 1353 fmov $ctr1.d[1], $ctr32x @ CTR block 5 1354 rev $ctr32w, $rctr32w @ CTR block 6 1355 add $rctr32w, $rctr32w, #1 @ CTR block 6 1356 1357 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 1358 1359 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 
10 high 1360#ifdef __AARCH64EB__ 1361 rev $output_h1, $output_h1 1362#endif 1363 eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low 1364#ifdef __AARCH64EB__ 1365 rev $output_l0, $output_l0 1366#endif 1367 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result 1368 1369 eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high 1370#ifdef __AARCH64EB__ 1371 rev $output_h0, $output_h0 1372#endif 1373 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result 1374 1375 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result 1376 b.ge .L128_dec_prepretail @ do prepretail 1377 1378 .L128_dec_main_loop: @ main loop start 1379 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 1380 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 1381 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 1382 1383 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 1384 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 1385 1386 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 1387 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 1388 1389 rev64 $res2b, $res2b @ GHASH block 4k+2 1390 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 1391 rev $ctr32w, $rctr32w @ CTR block 4k+7 1392 1393 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 1394 eor $res0b, $res0b, $acc_lb @ PRE 1 1395 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 1396 1397 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 1398 rev64 $res3b, $res3b @ GHASH block 4k+3 1399 1400 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 1401 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 1402 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 1403 1404 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 1405 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 1406 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 1407 1408 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 1409 fmov $ctr3.d[1], $ctr32x @ CTR block 
4k+7 1410 1411 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 1412 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 1413 1414 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 1415 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 1416 1417 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 1418 1419 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 1420 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 1421 1422 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 1423 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 1424 1425 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 1426 1427 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 1428 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 1429 1430 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 1431 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low 1432#ifdef __AARCH64EB__ 1433 rev $output_l3, $output_l3 1434#endif 1435 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 1436 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high 1437#ifdef __AARCH64EB__ 1438 rev $output_h2, $output_h2 1439#endif 1440 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 1441 1442 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 1443 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 1444 1445 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 1446 1447 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 1448 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 1449 1450 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 1451 1452 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 1453 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 1454 1455 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 1456 1457 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 1458 ins 
$t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 1459 1460 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 1461 1462 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 1463 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 1464 1465 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 1466 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 1467 1468 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 1469 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high 1470#ifdef __AARCH64EB__ 1471 rev $output_h3, $output_h3 1472#endif 1473 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 1474 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 1475 1476 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 1477 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low 1478#ifdef __AARCH64EB__ 1479 rev $output_l2, $output_l2 1480#endif 1481 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 1482 movi $mod_constant.8b, #0xc2 1483 1484 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 1485 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 1486 1487 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 1488 1489 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 1490 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 1491 1492 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 1493 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 1494 1495 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 1496 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 1497 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+3 - load ciphertext 1498 1499 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 1500 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 1501 1502 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 1503 shl 
$mod_constantd, $mod_constantd, #56 @ mod_constant 1504 1505 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 1506 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 1507 1508 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 1509 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 1510 1511 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 1512 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 1513 1514 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 1515 rev $ctr32w, $rctr32w @ CTR block 4k+8 1516 1517 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 1518 ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 1519 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 1520 1521 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 1522 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 1523 1524 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 1525 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 1526 1527 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 1528 1529 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 1530 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result 1531 1532 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 1533 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext 1534 1535 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 1536 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 1537 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result 1538 1539 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 1540 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext 1541 1542 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 1543 1544 rev64 $res1b, $res1b @ GHASH block 4k+5 1545 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 1546 mov $output_h0, 
$ctr0.d[1] @ AES block 4k+4 - mov high 1547 1548 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 1549 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 1550 1551 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 1552 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 1553 1554 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 1555 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 1556 rev $ctr32w, $rctr32w @ CTR block 4k+9 1557 1558 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 1559 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 1560 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 1561 1562 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 1563 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high 1564#ifdef __AARCH64EB__ 1565 rev $output_h0, $output_h0 1566#endif 1567 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 1568 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high 1569 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low 1570#ifdef __AARCH64EB__ 1571 rev $output_l0, $output_l0 1572#endif 1573 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result 1574 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low 1575 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 1576 1577 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 1578 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 1579 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 1580 1581 rev64 $res0b, $res0b @ GHASH block 4k+4 1582 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 1583 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 1584 1585 rev $ctr32w, $rctr32w @ CTR block 4k+10 1586 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 1587 1588 eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high 1589#ifdef __AARCH64EB__ 1590 rev $output_h1, $output_h1 1591#endif 1592 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result 1593 1594 eor 
$output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low 1595#ifdef __AARCH64EB__ 1596 rev $output_l1, $output_l1 1597#endif 1598 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result 1599 1600 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 1601 b.lt L128_dec_main_loop 1602 1603 .L128_dec_prepretail: @ PREPRETAIL 1604 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 1605 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 1606 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 1607 1608 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 1609 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 1610 1611 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 1612 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 1613 1614 eor $res0b, $res0b, $acc_lb @ PRE 1 1615 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 1616 rev64 $res2b, $res2b @ GHASH block 4k+2 1617 1618 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 1619 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 1620 1621 rev $ctr32w, $rctr32w @ CTR block 4k+7 1622 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 1623 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 1624 1625 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 1626 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 1627 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 1628 1629 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 1630 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 1631 1632 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 1633 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 1634 1635 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 1636 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 1637 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 1638 1639 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 1640 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 1641 1642 pmull $t3.1q, $t3.1d, 
$h34k.1d @ GHASH block 4k+1 - mid 1643 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 1644 1645 rev64 $res3b, $res3b @ GHASH block 4k+3 1646 1647 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 1648 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 1649 1650 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 1651 1652 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 1653 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 1654 1655 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 1656 1657 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 1658 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 1659 1660 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 1661 1662 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 1663 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 1664 1665 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 1666 1667 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 1668 1669 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 1670 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 1671 1672 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 1673 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 1674 1675 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 1676 1677 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 1678 movi $mod_constant.8b, #0xc2 1679 1680 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 1681 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 1682 1683 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 1684 1685 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 1686 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 1687 1688 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 1689 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low 1690#ifdef __AARCH64EB__ 1691 rev $output_l3, $output_l3 1692#endif 1693 
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 1694 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low 1695#ifdef __AARCH64EB__ 1696 rev $output_l2, $output_l2 1697#endif 1698 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 1699 1700 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 1701 1702 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 1703 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 1704 1705 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 1706 1707 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 1708 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 1709 1710 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 1711 1712 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 1713 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 1714 1715 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 1716 1717 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 1718 1719 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 1720 1721 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 1722 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 1723 1724 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 1725 1726 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 1727 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 1728 1729 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 1730 1731 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 1732 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 1733 1734 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 1735 1736 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 1737 1738 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 1739 
1740 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 1741 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 1742 1743 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 1744 1745 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 1746 1747 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 1748 1749 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 1750 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high 1751#ifdef __AARCH64EB__ 1752 rev $output_h3, $output_h3 1753#endif 1754 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 1755 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 1756 1757 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 1758 1759 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 1760 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 1761 1762 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 1763 1764 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 1765 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high 1766#ifdef __AARCH64EB__ 1767 rev $output_h2, $output_h2 1768#endif 1769 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 1770 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 1771 1772 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 1773 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 1774 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 1775 1776 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 1777 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 1778 .L128_dec_tail: @ TAIL 1779 1780 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 1781 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 1782 1783 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result 1784 1785 mov 
$output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 1786 1787 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 1788 1789 cmp $main_end_input_ptr, #48 1790 1791 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high 1792#ifdef __AARCH64EB__ 1793 rev $output_h0, $output_h0 1794#endif 1795 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 1796 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low 1797#ifdef __AARCH64EB__ 1798 rev $output_l0, $output_l0 1799#endif 1800 b.gt .L128_dec_blocks_more_than_3 1801 1802 mov $ctr3b, $ctr2b 1803 sub $rctr32w, $rctr32w, #1 1804 movi $acc_l.8b, #0 1805 1806 movi $acc_h.8b, #0 1807 mov $ctr2b, $ctr1b 1808 1809 movi $acc_m.8b, #0 1810 cmp $main_end_input_ptr, #32 1811 b.gt .L128_dec_blocks_more_than_2 1812 1813 cmp $main_end_input_ptr, #16 1814 1815 mov $ctr3b, $ctr1b 1816 sub $rctr32w, $rctr32w, #1 1817 b.gt .L128_dec_blocks_more_than_1 1818 1819 sub $rctr32w, $rctr32w, #1 1820 b .L128_dec_blocks_less_than_1 1821 .L128_dec_blocks_more_than_3: @ blocks left > 3 1822 rev64 $res0b, $res1b @ GHASH final-3 block 1823 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext 1824 1825 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1826 1827 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 1828 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result 1829 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result 1830 1831 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 1832 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high 1833 1834 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 1835 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low 1836 1837 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 1838 1839 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 1840 1841 movi $t0.8b, #0 @ suppress further partial tag feed in 1842 eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high 
1843#ifdef __AARCH64EB__ 1844 rev $output_h0, $output_h0 1845#endif 1846 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 1847 eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low 1848#ifdef __AARCH64EB__ 1849 rev $output_l0, $output_l0 1850#endif 1851 .L128_dec_blocks_more_than_2: @ blocks left > 2 1852 1853 rev64 $res0b, $res1b @ GHASH final-2 block 1854 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext 1855 1856 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1857 1858 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result 1859 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result 1860 1861 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 1862 1863 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 1864 1865 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 1866 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low 1867 1868 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high 1869 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 1870 1871 movi $t0.8b, #0 @ suppress further partial tag feed in 1872 1873 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 1874 1875 eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low 1876#ifdef __AARCH64EB__ 1877 rev $output_l0, $output_l0 1878#endif 1879 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 1880 1881 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 1882 1883 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 1884 eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high 1885#ifdef __AARCH64EB__ 1886 rev $output_h0, $output_h0 1887#endif 1888 .L128_dec_blocks_more_than_1: @ blocks left > 1 1889 1890 rev64 $res0b, $res1b @ GHASH final-1 block 1891 1892 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext 1893 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1894 1895 mov $rk4d, $res0.d[1] @ GHASH final-1 block - 
mid 1896 1897 eor $ctr0b, $res1b, $ctr3b @ AES final block - result 1898 1899 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 1900 1901 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result 1902 mov $output_l0, $ctr0.d[0] @ AES final block - mov low 1903 1904 mov $output_h0, $ctr0.d[1] @ AES final block - mov high 1905 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 1906 1907 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 1908 1909 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 1910 1911 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 1912 movi $t0.8b, #0 @ suppress further partial tag feed in 1913 1914 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 1915 1916 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 1917 eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high 1918#ifdef __AARCH64EB__ 1919 rev $output_h0, $output_h0 1920#endif 1921 eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low 1922#ifdef __AARCH64EB__ 1923 rev $output_l0, $output_l0 1924#endif 1925 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 1926 .L128_dec_blocks_less_than_1: @ blocks left <= 1 1927 1928 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff 1929 and $bit_length, $bit_length, #127 @ bit_length %= 128 1930 1931 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff 1932 sub $bit_length, $bit_length, #128 @ bit_length -= 128 1933 1934 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 1935 1936 and $bit_length, $bit_length, #127 @ bit_length %= 128 1937 1938 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block 1939 cmp $bit_length, #64 1940 1941 csel $ctr96_b64x, $rk10_h, xzr, lt 1942 csel $ctr32x, $rk10_l, $rk10_h, lt 1943 1944 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block 1945 1946 mov $ctr0.d[1], $ctr96_b64x 1947 1948 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in 
highest bits 1949 1950 rev64 $res0b, $res1b @ GHASH final block 1951 1952 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1953 1954 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite 1955 1956 and $output_h0, $output_h0, $ctr96_b64x 1957 1958 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 1959 mov $t0d, $res0.d[1] @ GHASH final block - mid 1960 1961 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 1962 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 1963 1964 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 1965 1966 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 1967 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes 1968 and $output_l0, $output_l0, $ctr32x 1969 1970#ifndef __AARCH64EB__ 1971 rev $ctr32w, $rctr32w 1972#else 1973 mov $ctr32w, $rctr32w 1974#endif 1975 1976 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 1977 movi $mod_constant.8b, #0xc2 1978 1979 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 1980 1981 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes 1982 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 1983 1984 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 1985 1986 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 1987 1988 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 1989 1990 orr $output_l0, $output_l0, $end_input_ptr 1991 str $ctr32w, [$counter, #12] @ store the updated counter 1992 1993 orr $output_h0, $output_h0, $main_end_input_ptr 1994 stp $output_l0, $output_h0, [$output_ptr] 1995 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 1996 1997 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 1998 1999 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 2000 2001 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 2002 ext $acc_mb, $acc_mb, $acc_mb, #8 @ 
MODULO - other mid alignment 2003 2004 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 2005 2006 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 2007 ext $acc_lb, $acc_lb, $acc_lb, #8 2008 rev64 $acc_lb, $acc_lb 2009 mov x0, $len 2010 st1 { $acc_l.16b }, [$current_tag] 2011 2012 ldp x21, x22, [sp, #16] 2013 ldp x23, x24, [sp, #32] 2014 ldp d8, d9, [sp, #48] 2015 ldp d10, d11, [sp, #64] 2016 ldp d12, d13, [sp, #80] 2017 ldp d14, d15, [sp, #96] 2018 ldp x19, x20, [sp], #112 2019 ret 2020 2021 .L128_dec_ret: 2022 mov w0, #0x0 2023 ret 2024.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel 2025___ 2026} 2027 2028{ 2029my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); 2030my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); 2031my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); 2032my ($output_l0,$output_h0)=map("x$_",(6..7)); 2033 2034my $ctr32w="w9"; 2035my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15)); 2036my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); 2037 2038my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); 2039my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); 2040my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); 2041my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); 2042 2043my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); 2044my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); 2045my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); 2046 2047my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); 2048my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); 2049my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); 2050 2051my $t0="v8"; 2052my $t0d="d8"; 2053my $t3="v4"; 2054my $t3d="d4"; 2055 2056my ($t1,$t2)=map("v$_",(30..31)); 2057my ($t1d,$t2d)=map("d$_",(30..31)); 2058 2059my $t4="v30"; 2060my $t4d="d30"; 2061my $t5="v8"; 2062my 
$t5d="d8"; 2063my $t6="v31"; 2064my $t6d="d31"; 2065 2066my $t7="v5"; 2067my $t7d="d5"; 2068my $t8="v6"; 2069my $t8d="d6"; 2070my $t9="v30"; 2071my $t9d="d30"; 2072 2073my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); 2074my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); 2075my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); 2076 2077my $mod_constantd="d8"; 2078my $mod_constant="v8"; 2079my $mod_t="v31"; 2080 2081my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29)); 2082my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29)); 2083my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29)); 2084my $rk2q1="v20.1q"; 2085my $rk3q1="v21.1q"; 2086my $rk4v="v22"; 2087my $rk4d="d22"; 2088 2089######################################################################################### 2090# size_t aes_gcm_enc_192_kernel(const unsigned char *in, 2091# size_t len, /* length in bits; the kernel derives byte_len as len >> 3 */ 2092# unsigned char *out, 2093# u64 *Xi, 2094# unsigned char ivec[16], 2095# const void *key); 2096# NOTE(review): trailing parameters listed as Xi, ivec, key to match how the kernel is called from C (x4 and x5 are copied to x16 and x8 on entry) - confirm against the C declaration. 2097$code.=<<___; 2098.global aes_gcm_enc_192_kernel 2099.type aes_gcm_enc_192_kernel,%function 2100.align 4 2101aes_gcm_enc_192_kernel: 2102 cbz x1, .L192_enc_ret 2103 stp x19, x20, [sp, #-112]!
2104 mov x16, x4 2105 mov x8, x5 2106 stp x21, x22, [sp, #16] 2107 stp x23, x24, [sp, #32] 2108 stp d8, d9, [sp, #48] 2109 stp d10, d11, [sp, #64] 2110 stp d12, d13, [sp, #80] 2111 stp d14, d15, [sp, #96] 2112 2113 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 2114#ifdef __AARCH64EB__ 2115 rev $ctr96_b64x, $ctr96_b64x 2116 rev $ctr96_t32x, $ctr96_t32x 2117#endif 2118 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 2119#ifdef __AARCH64EB__ 2120 ror $rk12_l, $rk12_l, #32 2121 ror $rk12_h, $rk12_h, #32 2122#endif 2123 ld1 {$rk0s}, [$cc], #16 @ load rk0 2124 2125 ld1 {$rk1s}, [$cc], #16 @ load rk1 2126 2127 ld1 {$rk2s}, [$cc], #16 @ load rk2 2128 2129 lsr $rctr32x, $ctr96_t32x, #32 2130 ld1 {$rk3s}, [$cc], #16 @ load rk3 2131 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 2132 2133 ld1 {$rk4s}, [$cc], #16 @ load rk4 2134 rev $rctr32w, $rctr32w @ rev_ctr32 2135 2136 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 2137 fmov $ctr3d, $ctr96_b64x @ CTR block 3 2138 2139 rev $ctr32w, $rctr32w @ CTR block 1 2140 add $rctr32w, $rctr32w, #1 @ CTR block 1 2141 fmov $ctr1d, $ctr96_b64x @ CTR block 1 2142 2143 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 2144 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 2145 2146 fmov $ctr1.d[1], $ctr32x @ CTR block 1 2147 rev $ctr32w, $rctr32w @ CTR block 2 2148 add $rctr32w, $rctr32w, #1 @ CTR block 2 2149 2150 fmov $ctr2d, $ctr96_b64x @ CTR block 2 2151 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 2152 2153 fmov $ctr2.d[1], $ctr32x @ CTR block 2 2154 rev $ctr32w, $rctr32w @ CTR block 3 2155 2156 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 2157 ld1 {$rk5s}, [$cc], #16 @ load rk5 2158 2159 fmov $ctr3.d[1], $ctr32x @ CTR block 3 2160 2161 ld1 {$rk6s}, [$cc], #16 @ load rk6 2162 2163 ld1 {$rk7s}, [$cc], #16 @ load rk7 2164 2165 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 2166 ld1 { $acc_lb}, 
[$current_tag] 2167 ext $acc_lb, $acc_lb, $acc_lb, #8 2168 rev64 $acc_lb, $acc_lb 2169 2170 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 2171 ld1 {$rk8s}, [$cc], #16 @ load rk8 2172 2173 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 2174 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 2175#ifndef __AARCH64EB__ 2176 ext $h4b, $h4b, $h4b, #8 2177#endif 2178 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 2179 ld1 {$rk9s}, [$cc], #16 @ load rk9 2180 2181 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 2182 ld1 {$rk10s}, [$cc], #16 @ load rk10 2183 2184 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 2185 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 2186#ifndef __AARCH64EB__ 2187 ext $h1b, $h1b, $h1b, #8 2188#endif 2189 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 2190 ld1 {$rk11s}, [$cc], #16 @ load rk11 2191 2192 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 2193 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 2194#ifndef __AARCH64EB__ 2195 ext $h3b, $h3b, $h3b, #8 2196#endif 2197 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 2198 2199 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 2200 2201 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 2202 2203 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 2204 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 2205 2206 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 2207 2208 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 2209 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 2210 2211 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 2212 2213 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 2214 2215 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 2216 2217 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 2218 2219 aese $ctr2b, 
$rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 2220 2221 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 2222 2223 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 2224 2225 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 2226 2227 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 2228 2229 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 2230 2231 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 2232 2233 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 2234 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 2235#ifndef __AARCH64EB__ 2236 ext $h2b, $h2b, $h2b, #8 2237#endif 2238 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 2239 2240 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 2241 2242 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 2243 2244 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 2245 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 2246 2247 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 2248 2249 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 2250 2251 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 2252 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 2253 2254 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 2255 2256 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 2257 2258 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 2259 2260 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 2261 2262 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 2263 2264 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 2265 2266 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 2267 2268 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 2269 2270 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 
- round 10 2271 2272 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 2273 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 2274 mov $len, $main_end_input_ptr 2275 2276 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 2277 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 2278 2279 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 2280 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 2281 2282 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 2283 2284 aese $ctr2b, $rk11 @ AES block 2 - round 11 2285 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 2286 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 2287 2288 aese $ctr1b, $rk11 @ AES block 1 - round 11 2289 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 2290 2291 aese $ctr0b, $rk11 @ AES block 0 - round 11 2292 add $rctr32w, $rctr32w, #1 @ CTR block 3 2293 2294 aese $ctr3b, $rk11 @ AES block 3 - round 11 2295 b.ge .L192_enc_tail @ handle tail 2296 2297 rev $ctr32w, $rctr32w @ CTR block 4 2298 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext 2299#ifdef __AARCH64EB__ 2300 rev $input_l0, $input_l0 2301 rev $input_h0, $input_h0 2302#endif 2303 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 2304 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext 2305#ifdef __AARCH64EB__ 2306 rev $input_l2, $input_l2 2307 rev $input_h2, $input_h2 2308#endif 2309 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext 2310#ifdef __AARCH64EB__ 2311 rev $input_l3, $input_l3 2312 rev $input_h3, $input_h3 2313#endif 2314 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext 2315#ifdef __AARCH64EB__ 2316 rev $input_l1, $input_l1 2317 rev $input_h1, $input_h1 2318#endif 2319 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 2320 cmp 
$input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 2321 2322 eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low 2323 2324 eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high 2325 eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high 2326 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low 2327 2328 eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high 2329 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high 2330 2331 eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low 2332 eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low 2333 2334 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low 2335 eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high 2336 2337 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high 2338 2339 eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low 2340 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low 2341 2342 add $rctr32w, $rctr32w, #1 @ CTR block 4 2343 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result 2344 fmov $ctr0d, $ctr96_b64x @ CTR block 4 2345 2346 fmov $ctr0.d[1], $ctr32x @ CTR block 4 2347 rev $ctr32w, $rctr32w @ CTR block 5 2348 2349 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 2350 add $rctr32w, $rctr32w, #1 @ CTR block 5 2351 2352 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low 2353 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result 2354 2355 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high 2356 2357 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result 2358 fmov $ctr1d, $ctr96_b64x @ CTR block 5 2359 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result 2360 2361 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high 2362 2363 fmov $ctr1.d[1], $ctr32x @ CTR block 5 2364 rev $ctr32w, $rctr32w @ CTR block 6 2365 2366 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 2367 2368 add $rctr32w, $rctr32w, #1 @ CTR block 6 2369 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result 2370 fmov $ctr2d, 
$ctr96_b64x @ CTR block 6 2371 2372 fmov $ctr2.d[1], $ctr32x @ CTR block 6 2373 rev $ctr32w, $rctr32w @ CTR block 7 2374 2375 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 2376 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result 2377 2378 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result 2379 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result 2380 b.ge .L192_enc_prepretail @ do prepretail 2381 2382 .L192_enc_main_loop: @ main loop start 2383 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 2384 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 2385 2386 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 2387 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext 2388#ifdef __AARCH64EB__ 2389 rev $input_l1, $input_l1 2390 rev $input_h1, $input_h1 2391#endif 2392 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 2393 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 2394 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 2395 2396 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 2397 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 2398 2399 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 2400 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 2401 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext 2402#ifdef __AARCH64EB__ 2403 rev $input_l2, $input_l2 2404 rev $input_h2, $input_h2 2405#endif 2406 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 2407 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext 2408#ifdef __AARCH64EB__ 2409 rev $input_l3, $input_l3 2410 rev $input_h3, $input_h3 2411#endif 2412 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 2413 eor $res0b, $res0b, $acc_lb @ PRE 1 2414 2415 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 2416 2417 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 2418 rev64 
$res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 2419 2420 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 2421 eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high 2422 2423 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 2424 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 2425 2426 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 2427 2428 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 2429 eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low 2430 2431 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 2432 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 2433 2434 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 2435 eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low 2436 2437 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 2438 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 2439 2440 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 2441 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 2442 2443 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 2444 2445 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 2446 2447 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 2448 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 2449 2450 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 2451 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 2452 2453 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 2454 2455 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 2456 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 2457 2458 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 2459 2460 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 2461 eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high 2462 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 2463 2464 aese $ctr0b, $rk5 \n 
aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 2465 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 2466 2467 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 2468 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 2469 2470 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 2471 eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high 2472 2473 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 2474 eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low 2475 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 2476 2477 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 2478 rev $ctr32w, $rctr32w @ CTR block 4k+8 2479 2480 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 2481 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 2482 2483 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 2484 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 2485 2486 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 2487 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext 2488#ifdef __AARCH64EB__ 2489 rev $input_l0, $input_l0 2490 rev $input_h0, $input_h0 2491#endif 2492 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 2493 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 2494 2495 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 2496 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 2497 2498 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 2499 movi $mod_constant.8b, #0xc2 2500 2501 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 2502 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high 2503 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 2504 2505 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 2506 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low 2507 2508 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 
4k+5 - round 6 2509 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 2510 2511 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 2512 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 2513 2514 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 2515 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low 2516 2517 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 2518 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 2519 2520 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 2521 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high 2522 2523 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 2524 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 2525 2526 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 2527 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 2528 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 2529 2530 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 2531 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 2532 2533 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 2534 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low 2535 2536 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 2537 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 2538 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 2539 2540 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 2541 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high 2542 2543 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 2544 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 2545 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low 2546 2547 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 2548 2549 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 2550 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 2551 2552 aese 
$ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 2553 2554 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 2555 2556 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 2557 2558 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 2559 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 2560 2561 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 2562 2563 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 2564 2565 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 2566 2567 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 2568 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 2569 2570 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 2571 2572 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 2573 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 2574 2575 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 2576 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 2577 rev $ctr32w, $rctr32w @ CTR block 4k+9 2578 2579 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 2580 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high 2581 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result 2582 2583 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 2584 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 2585 2586 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result 2587 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 2588 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 2589 2590 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 2591 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 2592 rev $ctr32w, $rctr32w @ CTR block 4k+10 2593 2594 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 2595 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 2596 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 2597 2598 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - 
store result 2599 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 2600 2601 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 2602 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result 2603 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 2604 2605 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result 2606 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 2607 rev $ctr32w, $rctr32w @ CTR block 4k+11 2608 2609 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 2610 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 2611 2612 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result 2613 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result 2614 b.lt .L192_enc_main_loop 2615 2616 .L192_enc_prepretail: @ PREPRETAIL 2617 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 2618 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 2619 2620 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 2621 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 2622 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 2623 2624 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 2625 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 2626 2627 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 2628 2629 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 2630 eor $res0b, $res0b, $acc_lb @ PRE 1 2631 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 2632 2633 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 2634 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 2635 2636 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 2637 2638 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 2639 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 2640 2641 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 2642 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 2643 2644 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 2645 2646 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 2647 
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 2648 2649 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 2650 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 2651 2652 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 2653 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 2654 2655 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 2656 2657 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 2658 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 2659 2660 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 2661 2662 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 2663 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 2664 2665 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 2666 2667 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 2668 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 2669 2670 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 2671 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 2672 2673 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 2674 2675 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 2676 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 2677 2678 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 2679 2680 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 2681 2682 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 2683 2684 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 2685 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 2686 2687 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 2688 2689 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 2690 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 2691 2692 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 2693 2694 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 2695 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 
4k+2 - low 2696 2697 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 2698 2699 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 2700 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 2701 2702 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 2703 2704 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 2705 movi $mod_constant.8b, #0xc2 2706 2707 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 2708 2709 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 2710 2711 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 2712 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 2713 2714 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 2715 2716 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 2717 2718 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 2719 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 2720 2721 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 2722 2723 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 2724 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up 2725 2726 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 2727 2728 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 2729 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 2730 2731 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 2732 2733 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 2734 eor $acc_mb, $acc_mb, $acc_lb 2735 2736 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 2737 2738 pmull $t1.1q, $acc_h.1d, $mod_constant.1d 2739 2740 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 2741 ext $acc_hb, $acc_hb, $acc_hb, #8 2742 2743 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 2744 2745 aese $ctr1b, $rk8 \n aesmc $ctr1b, 
$ctr1b @ AES block 4k+5 - round 8 2746 eor $acc_mb, $acc_mb, $t1.16b 2747 2748 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 2749 2750 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 2751 2752 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 2753 2754 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 2755 eor $acc_mb, $acc_mb, $acc_hb 2756 2757 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 2758 2759 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 2760 2761 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 2762 2763 pmull $t1.1q, $acc_m.1d, $mod_constant.1d 2764 2765 ext $acc_mb, $acc_mb, $acc_mb, #8 2766 2767 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 2768 2769 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 2770 2771 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 2772 2773 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 2774 eor $acc_lb, $acc_lb, $t1.16b 2775 2776 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 2777 2778 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 2779 2780 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 2781 2782 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 2783 eor $acc_lb, $acc_lb, $acc_mb 2784 .L192_enc_tail: @ TAIL 2785 2786 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 2787 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext 2788#ifdef __AARCH64EB__ 2789 rev $input_l0, $input_l0 2790 rev $input_h0, $input_h0 2791#endif 2792 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low 2793 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high 2794 2795 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 2796 2797 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 2798 cmp 
$main_end_input_ptr, #48 2799 2800 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 2801 2802 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 2803 b.gt .L192_enc_blocks_more_than_3 2804 2805 sub $rctr32w, $rctr32w, #1 2806 movi $acc_m.8b, #0 2807 2808 mov $ctr3b, $ctr2b 2809 movi $acc_h.8b, #0 2810 cmp $main_end_input_ptr, #32 2811 2812 mov $ctr2b, $ctr1b 2813 movi $acc_l.8b, #0 2814 b.gt .L192_enc_blocks_more_than_2 2815 2816 sub $rctr32w, $rctr32w, #1 2817 2818 mov $ctr3b, $ctr1b 2819 cmp $main_end_input_ptr, #16 2820 b.gt .L192_enc_blocks_more_than_1 2821 2822 sub $rctr32w, $rctr32w, #1 2823 b .L192_enc_blocks_less_than_1 2824 .L192_enc_blocks_more_than_3: @ blocks left > 3 2825 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result 2826 2827 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high 2828#ifdef __AARCH64EB__ 2829 rev $input_l0, $input_l0 2830 rev $input_h0, $input_h0 2831#endif 2832 rev64 $res0b, $res1b @ GHASH final-3 block 2833 2834 eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low 2835 eor $res0b, $res0b, $t0.16b @ feed in partial tag 2836 2837 eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high 2838 fmov $res1d, $input_l0 @ AES final-2 block - mov low 2839 2840 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high 2841 2842 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 2843 2844 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 2845 2846 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 2847 2848 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 2849 2850 movi $t0.8b, #0 @ suppress further partial tag feed in 2851 2852 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 2853 2854 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 2855 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result 2856 .L192_enc_blocks_more_than_2: @ blocks left > 2 2857 2858 st1 { $res1b}, [$output_ptr], #16 
@ AES final-2 block - store result 2859 2860 rev64 $res0b, $res1b @ GHASH final-2 block 2861 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high 2862#ifdef __AARCH64EB__ 2863 rev $input_l0, $input_l0 2864 rev $input_h0, $input_h0 2865#endif 2866 eor $res0b, $res0b, $t0.16b @ feed in partial tag 2867 2868 eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high 2869 2870 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 2871 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 2872 2873 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 2874 eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low 2875 2876 fmov $res1d, $input_l0 @ AES final-1 block - mov low 2877 2878 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high 2879 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 2880 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 2881 2882 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 2883 2884 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 2885 2886 movi $t0.8b, #0 @ suppress further partial tag feed in 2887 2888 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result 2889 2890 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 2891 .L192_enc_blocks_more_than_1: @ blocks left > 1 2892 2893 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result 2894 2895 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high 2896#ifdef __AARCH64EB__ 2897 rev $input_l0, $input_l0 2898 rev $input_h0, $input_h0 2899#endif 2900 rev64 $res0b, $res1b @ GHASH final-1 block 2901 2902 eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low 2903 eor $res0b, $res0b, $t0.16b @ feed in partial tag 2904 movi $t0.8b, #0 @ suppress further partial tag feed in 2905 2906 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 2907 2908 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 2909 eor $input_h0, 
$input_h0, $rk12_h @ AES final block - round 12 high 2910 fmov $res1d, $input_l0 @ AES final block - mov low 2911 2912 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 2913 fmov $res1.d[1], $input_h0 @ AES final block - mov high 2914 2915 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 2916 2917 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 2918 2919 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 2920 2921 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 2922 2923 eor $res1b, $res1b, $ctr3b @ AES final block - result 2924 2925 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 2926 2927 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 2928 .L192_enc_blocks_less_than_1: @ blocks left <= 1 2929 2930 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored 2931#ifndef __AARCH64EB__ 2932 rev $ctr32w, $rctr32w 2933#else 2934 mov $ctr32w, $rctr32w 2935#endif 2936 and $bit_length, $bit_length, #127 @ bit_length %= 128 2937 2938 sub $bit_length, $bit_length, #128 @ bit_length -= 128 2939 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff 2940 2941 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 2942 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff 2943 2944 and $bit_length, $bit_length, #127 @ bit_length %= 128 2945 2946 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block 2947 cmp $bit_length, #64 2948 2949 csel $input_l0, $rk12_l, $rk12_h, lt 2950 csel $input_h0, $rk12_h, xzr, lt 2951 2952 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block 2953 2954 fmov $ctr0.d[1], $input_h0 2955 2956 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 2957 2958 rev64 $res0b, $res1b @ GHASH final block 2959 2960 eor $res0b, $res0b, $t0.16b @ feed in partial tag 2961 2962 mov $t0d, $res0.d[1] @ GHASH final block - mid 2963 2964 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 
2965 2966 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 2967 2968 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 2969 2970 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 2971 2972 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 2973 2974 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 2975 2976 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 2977 movi $mod_constant.8b, #0xc2 2978 2979 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 2980 2981 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 2982 2983 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing 2984 2985 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 2986 2987 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 2988 2989 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 2990 2991 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 2992 2993 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 2994 2995 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 2996 2997 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 2998 2999 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 3000 str $ctr32w, [$counter, #12] @ store the updated counter 3001 3002 st1 { $res1b}, [$output_ptr] @ store all 16B 3003 3004 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 3005 ext $acc_lb, $acc_lb, $acc_lb, #8 3006 rev64 $acc_lb, $acc_lb 3007 mov x0, $len 3008 st1 { $acc_l.16b }, [$current_tag] 3009 3010 ldp x21, x22, [sp, #16] 3011 ldp x23, x24, [sp, #32] 3012 ldp d8, d9, [sp, #48] 3013 ldp d10, d11, [sp, #64] 3014 ldp d12, d13, [sp, #80] 3015 ldp d14, d15, [sp, #96] 3016 ldp x19, x20, [sp], #112 3017 ret 3018 3019.L192_enc_ret: 3020 mov w0, #0x0 3021 ret 3022.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel 3023___ 3024 3025######################################################################################### 3026# 
size_t aes_gcm_dec_192_kernel(const unsigned char *in, 3027# size_t len, 3028# unsigned char *out, 3029# const void *key, 3030# unsigned char ivec[16], 3031# u64 *Xi); 3032# 3033$code.=<<___; 3034.global aes_gcm_dec_192_kernel 3035.type aes_gcm_dec_192_kernel,%function 3036.align 4 3037aes_gcm_dec_192_kernel: 3038 cbz x1, .L192_dec_ret 3039 stp x19, x20, [sp, #-112]! 3040 mov x16, x4 3041 mov x8, x5 3042 stp x21, x22, [sp, #16] 3043 stp x23, x24, [sp, #32] 3044 stp d8, d9, [sp, #48] 3045 stp d10, d11, [sp, #64] 3046 stp d12, d13, [sp, #80] 3047 stp d14, d15, [sp, #96] 3048 3049 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 3050 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 3051#ifdef __AARCH64EB__ 3052 rev $ctr96_b64x, $ctr96_b64x 3053 rev $ctr96_t32x, $ctr96_t32x 3054#endif 3055 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 3056#ifdef __AARCH64EB__ 3057 ror $rk12_l, $rk12_l, #32 3058 ror $rk12_h, $rk12_h, #32 3059#endif 3060 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 3061 3062 ld1 {$rk0s}, [$cc], #16 @ load rk0 3063 3064 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 3065 mov $len, $main_end_input_ptr 3066 ld1 {$rk1s}, [$cc], #16 @ load rk1 3067 3068 lsr $rctr32x, $ctr96_t32x, #32 3069 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 3070 fmov $ctr3d, $ctr96_b64x @ CTR block 3 3071 3072 rev $rctr32w, $rctr32w @ rev_ctr32 3073 fmov $ctr1d, $ctr96_b64x @ CTR block 1 3074 3075 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 3076 ld1 {$rk2s}, [$cc], #16 @ load rk2 3077 3078 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 3079 rev $ctr32w, $rctr32w @ CTR block 1 3080 3081 add $rctr32w, $rctr32w, #1 @ CTR block 1 3082 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 3083 ld1 {$rk3s}, [$cc], #16 @ load rk3 3084 3085 fmov $ctr1.d[1], $ctr32x @ CTR block 1 3086 rev $ctr32w, $rctr32w @ CTR block 2 3087 add $rctr32w, $rctr32w, #1 @ 
CTR block 2 3088 3089 fmov $ctr2d, $ctr96_b64x @ CTR block 2 3090 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 3091 3092 fmov $ctr2.d[1], $ctr32x @ CTR block 2 3093 rev $ctr32w, $rctr32w @ CTR block 3 3094 3095 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 3096 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 3097 3098 fmov $ctr3.d[1], $ctr32x @ CTR block 3 3099 3100 ld1 {$rk4s}, [$cc], #16 @ load rk4 3101 3102 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 3103 3104 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 3105 ld1 {$rk5s}, [$cc], #16 @ load rk5 3106 3107 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 3108 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 3109#ifndef __AARCH64EB__ 3110 ext $h4b, $h4b, $h4b, #8 3111#endif 3112 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 3113 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 3114#ifndef __AARCH64EB__ 3115 ext $h2b, $h2b, $h2b, #8 3116#endif 3117 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 3118 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 3119#ifndef __AARCH64EB__ 3120 ext $h3b, $h3b, $h3b, #8 3121#endif 3122 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 3123 3124 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 3125 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 3126#ifndef __AARCH64EB__ 3127 ext $h1b, $h1b, $h1b, #8 3128#endif 3129 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 3130 ld1 {$rk6s}, [$cc], #16 @ load rk6 3131 3132 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 3133 ld1 {$rk7s}, [$cc], #16 @ load rk7 3134 3135 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 3136 ld1 {$rk8s}, [$cc], #16 @ load rk8 3137 3138 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 3139 ld1 {$rk9s}, [$cc], #16 @ load rk9 3140 3141 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES 
block 2 - round 3 3142 ld1 { $acc_lb}, [$current_tag] 3143 ext $acc_lb, $acc_lb, $acc_lb, #8 3144 rev64 $acc_lb, $acc_lb 3145 3146 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 3147 add $rctr32w, $rctr32w, #1 @ CTR block 3 3148 3149 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 3150 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 3151 3152 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 3153 ld1 {$rk10s}, [$cc], #16 @ load rk10 3154 3155 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 3156 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 3157 3158 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 3159 3160 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 3161 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 3162 3163 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 3164 ld1 {$rk11s}, [$cc], #16 @ load rk11 3165 3166 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 3167 3168 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 3169 3170 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 3171 3172 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 3173 3174 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 3175 3176 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 3177 3178 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 3179 3180 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 3181 3182 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 3183 3184 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 3185 3186 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 3187 3188 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 3189 3190 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 3191 3192 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 3193 3194 
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 3195 3196 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 3197 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 3198 3199 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 3200 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 3201 3202 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 3203 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 3204 3205 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 3206 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 3207 3208 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 3209 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 3210 3211 aese $ctr3b, $rk11 @ AES block 3 - round 11 3212 3213 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 3214 3215 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 3216 3217 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 3218 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 3219 3220 aese $ctr2b, $rk11 @ AES block 2 - round 11 3221 3222 aese $ctr1b, $rk11 @ AES block 1 - round 11 3223 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 3224 3225 aese $ctr0b, $rk11 @ AES block 0 - round 11 3226 b.ge .L192_dec_tail @ handle tail 3227 3228 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext 3229 3230 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result 3231 3232 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result 3233 rev $ctr32w, $rctr32w @ CTR block 4 3234 ld1 {$res2b, $res3b}, [$input_ptr], #32 @ AES block 2,3 - load ciphertext 3235 3236 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low 3237 3238 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high 3239 3240 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low 3241 orr $ctr32x, $ctr96_t32x, $ctr32x, 
lsl #32 @ CTR block 4 3242 add $rctr32w, $rctr32w, #1 @ CTR block 4 3243 3244 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high 3245 rev64 $res0b, $res0b @ GHASH block 0 3246 3247 fmov $ctr0d, $ctr96_b64x @ CTR block 4 3248 rev64 $res1b, $res1b @ GHASH block 1 3249 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 3250 3251 eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low 3252#ifdef __AARCH64EB__ 3253 rev $output_l1, $output_l1 3254#endif 3255 fmov $ctr0.d[1], $ctr32x @ CTR block 4 3256 rev $ctr32w, $rctr32w @ CTR block 5 3257 3258 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 3259 fmov $ctr1d, $ctr96_b64x @ CTR block 5 3260 eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high 3261#ifdef __AARCH64EB__ 3262 rev $output_h1, $output_h1 3263#endif 3264 add $rctr32w, $rctr32w, #1 @ CTR block 5 3265 fmov $ctr1.d[1], $ctr32x @ CTR block 5 3266 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low 3267#ifdef __AARCH64EB__ 3268 rev $output_l0, $output_l0 3269#endif 3270 rev $ctr32w, $rctr32w @ CTR block 6 3271 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high 3272#ifdef __AARCH64EB__ 3273 rev $output_h0, $output_h0 3274#endif 3275 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result 3276 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 3277 3278 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result 3279 3280 add $rctr32w, $rctr32w, #1 @ CTR block 6 3281 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result 3282 b.ge .L192_dec_prepretail @ do prepretail 3283 3284 .L192_dec_main_loop: @ main loop start 3285 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 3286 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 3287 3288 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 3289 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 3290 3291 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 3292 eor $ctr3b, $res3b, $ctr3b @ 
AES block 4k+3 - result 3293 rev64 $res3b, $res3b @ GHASH block 4k+3 3294 3295 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 3296 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 3297 3298 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 3299 eor $res0b, $res0b, $acc_lb @ PRE 1 3300 3301 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 3302 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 3303 3304 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 3305 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 3306 3307 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 3308 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 3309 3310 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 3311 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 3312 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 3313 3314 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 3315 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 3316 rev $ctr32w, $rctr32w @ CTR block 4k+7 3317 3318 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 3319 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 3320 3321 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 3322 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 3323 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 3324 3325 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 3326 3327 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 3328 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high 3329#ifdef __AARCH64EB__ 3330 rev $output_h2, $output_h2 3331#endif 3332 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 3333 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 3334 3335 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 3336 3337 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 3338 rev64 $res2b, $res2b @ GHASH block 4k+2 3339 3340 aese $ctr2b, $rk2 \n 
aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 3341 3342 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 3343 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 3344 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low 3345#ifdef __AARCH64EB__ 3346 rev $output_l2, $output_l2 3347#endif 3348 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 3349 3350 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 3351 3352 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 3353 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 3354 3355 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 3356 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 3357 3358 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 3359 3360 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 3361 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 3362 3363 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 3364 3365 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 3366 3367 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 3368 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 3369 3370 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 3371 3372 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 3373 3374 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 3375 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 3376 3377 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 3378 3379 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 3380 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 3381 3382 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 3383 3384 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 3385 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 3386 3387 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 3388 3389 pmull2 $t6.1q, 
$t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 3390 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 3391 3392 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 3393 3394 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 3395 movi $mod_constant.8b, #0xc2 3396 3397 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 3398 3399 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 3400 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 3401 3402 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 3403 3404 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 3405 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 3406 3407 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 3408 3409 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 3410 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 3411 3412 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 3413 3414 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 3415 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 3416 3417 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 3418 3419 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 3420 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 3421 3422 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 3423 3424 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 3425 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 3426 3427 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 3428 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 3429 3430 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 3431 ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext 3432 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low 3433#ifdef 
__AARCH64EB__ 3434 rev $output_l3, $output_l3 3435#endif 3436 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 3437 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 3438 3439 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 3440 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 3441 3442 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 3443 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 3444 3445 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 3446 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext 3447 3448 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 3449 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext 3450 rev $ctr32w, $rctr32w @ CTR block 4k+8 3451 3452 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 3453 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 3454 3455 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 3456 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 3457 3458 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 3459 3460 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result 3461 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high 3462#ifdef __AARCH64EB__ 3463 rev $output_h3, $output_h3 3464#endif 3465 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result 3466 3467 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 3468 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 3469 3470 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 3471 3472 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 3473 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low 3474 3475 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 3476 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 3477 rev64 $res1b, $res1b @ GHASH block 4k+5 3478 3479 aese $ctr2b, 
$rk11 @ AES block 4k+6 - round 11 3480 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 3481 3482 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 3483 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high 3484 3485 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 3486 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 3487 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 3488 3489 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result 3490 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 3491 rev $ctr32w, $rctr32w @ CTR block 4k+9 3492 3493 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low 3494#ifdef __AARCH64EB__ 3495 rev $output_l0, $output_l0 3496#endif 3497 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 3498 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 3499 3500 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 3501 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 3502 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low 3503#ifdef __AARCH64EB__ 3504 rev $output_l1, $output_l1 3505#endif 3506 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 3507 rev $ctr32w, $rctr32w @ CTR block 4k+10 3508 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high 3509#ifdef __AARCH64EB__ 3510 rev $output_h1, $output_h1 3511#endif 3512 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high 3513#ifdef __AARCH64EB__ 3514 rev $output_h0, $output_h0 3515#endif 3516 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result 3517 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 3518 3519 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 3520 rev64 $res0b, $res0b @ GHASH block 4k+4 3521 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 3522 3523 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 3524 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result 3525 b.lt .L192_dec_main_loop 3526 3527 .L192_dec_prepretail: @ PREPRETAIL 3528 mov 
$output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 3529 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 3530 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 3531 3532 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 3533 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 3534 3535 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 3536 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 3537 3538 eor $res0b, $res0b, $acc_lb @ PRE 1 3539 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 3540 3541 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 3542 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 3543 3544 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 3545 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 3546 3547 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 3548 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 3549 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 3550 3551 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 3552 rev64 $res2b, $res2b @ GHASH block 4k+2 3553 3554 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 3555 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 3556 rev $ctr32w, $rctr32w @ CTR block 4k+7 3557 3558 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 3559 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 3560 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 3561 3562 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 3563 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high 3564#ifdef __AARCH64EB__ 3565 rev $output_h3, $output_h3 3566#endif 3567 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 3568 3569 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 3570 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low 3571#ifdef __AARCH64EB__ 3572 rev $output_l2, $output_l2 3573#endif 3574 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 3575 eor $output_h2, $output_h2, $rk12_h 
@ AES block 4k+2 - round 12 high 3576#ifdef __AARCH64EB__ 3577 rev $output_h2, $output_h2 3578#endif 3579 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 3580 3581 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 3582 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low 3583#ifdef __AARCH64EB__ 3584 rev $output_l3, $output_l3 3585#endif 3586 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 3587 3588 rev64 $res3b, $res3b @ GHASH block 4k+3 3589 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 3590 3591 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 3592 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 3593 3594 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 3595 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 3596 3597 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 3598 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 3599 3600 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 3601 3602 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 3603 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 3604 3605 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 3606 3607 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 3608 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 3609 3610 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 3611 3612 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 3613 3614 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 3615 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 3616 3617 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 3618 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 3619 3620 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 3621 3622 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 3623 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 3624 3625 aese 
$ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 3626 3627 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 3628 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 3629 3630 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 3631 3632 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 3633 movi $mod_constant.8b, #0xc2 3634 3635 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 3636 3637 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 3638 3639 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 3640 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 3641 3642 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 3643 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 3644 3645 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 3646 3647 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 3648 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 3649 3650 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 3651 3652 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 3653 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 3654 3655 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 3656 3657 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 3658 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 3659 3660 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 3661 3662 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 3663 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 3664 3665 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 3666 3667 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 3668 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 3669 3670 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 3671 3672 aese 
$ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 3673 3674 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 3675 3676 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 3677 3678 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 3679 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 3680 3681 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 3682 3683 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 3684 3685 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 3686 3687 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 3688 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 3689 3690 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 3691 3692 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 3693 3694 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 3695 3696 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 3697 3698 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 3699 3700 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 3701 3702 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 3703 3704 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 3705 3706 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 3707 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 3708 3709 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 3710 3711 aese $ctr0b, $rk11 3712 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 3713 3714 aese $ctr2b, $rk11 3715 3716 aese $ctr1b, $rk11 3717 3718 aese $ctr3b, $rk11 3719 3720 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 3721 .L192_dec_tail: @ TAIL 3722 3723 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of 
bytes left to process 3724 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 3725 3726 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result 3727 3728 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 3729 3730 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 3731 3732 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 3733 3734 cmp $main_end_input_ptr, #48 3735 3736 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high 3737#ifdef __AARCH64EB__ 3738 rev $output_h0, $output_h0 3739#endif 3740 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low 3741#ifdef __AARCH64EB__ 3742 rev $output_l0, $output_l0 3743#endif 3744 b.gt .L192_dec_blocks_more_than_3 3745 3746 movi $acc_l.8b, #0 3747 movi $acc_h.8b, #0 3748 3749 mov $ctr3b, $ctr2b 3750 mov $ctr2b, $ctr1b 3751 sub $rctr32w, $rctr32w, #1 3752 3753 movi $acc_m.8b, #0 3754 cmp $main_end_input_ptr, #32 3755 b.gt .L192_dec_blocks_more_than_2 3756 3757 mov $ctr3b, $ctr1b 3758 cmp $main_end_input_ptr, #16 3759 sub $rctr32w, $rctr32w, #1 3760 3761 b.gt .L192_dec_blocks_more_than_1 3762 3763 sub $rctr32w, $rctr32w, #1 3764 b .L192_dec_blocks_less_than_1 3765 .L192_dec_blocks_more_than_3: @ blocks left > 3 3766 rev64 $res0b, $res1b @ GHASH final-3 block 3767 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext 3768 3769 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result 3770 3771 eor $res0b, $res0b, $t0.16b @ feed in partial tag 3772 3773 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result 3774 3775 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 3776 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low 3777 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 3778 3779 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high 3780 3781 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 3782 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 3783 3784 pmull2 $acc_h.1q, $res0.2d, 
$h4.2d @ GHASH final-3 block - high 3785 3786 eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low 3787#ifdef __AARCH64EB__ 3788 rev $output_l0, $output_l0 3789#endif 3790 movi $t0.8b, #0 @ suppress further partial tag feed in 3791 3792 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 3793 eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high 3794#ifdef __AARCH64EB__ 3795 rev $output_h0, $output_h0 3796#endif 3797 .L192_dec_blocks_more_than_2: @ blocks left > 2 3798 3799 rev64 $res0b, $res1b @ GHASH final-2 block 3800 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext 3801 3802 eor $res0b, $res0b, $t0.16b @ feed in partial tag 3803 3804 movi $t0.8b, #0 @ suppress further partial tag feed in 3805 3806 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result 3807 3808 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 3809 3810 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 3811 3812 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result 3813 3814 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 3815 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high 3816 3817 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 3818 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low 3819 3820 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 3821 3822 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 3823 3824 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 3825 eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high 3826#ifdef __AARCH64EB__ 3827 rev $output_h0, $output_h0 3828#endif 3829 eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low 3830#ifdef __AARCH64EB__ 3831 rev $output_l0, $output_l0 3832#endif 3833 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 3834 .L192_dec_blocks_more_than_1: @ blocks left > 1 3835 3836 rev64 $res0b, $res1b @ GHASH final-1 block 3837 3838 
eor $res0b, $res0b, $t0.16b @ feed in partial tag 3839 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext 3840 3841 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 3842 3843 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 3844 3845 eor $ctr0b, $res1b, $ctr3b @ AES final block - result 3846 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result 3847 3848 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 3849 3850 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 3851 3852 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 3853 mov $output_h0, $ctr0.d[1] @ AES final block - mov high 3854 3855 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 3856 mov $output_l0, $ctr0.d[0] @ AES final block - mov low 3857 3858 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 3859 3860 movi $t0.8b, #0 @ suppress further partial tag feed in 3861 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 3862 eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high 3863#ifdef __AARCH64EB__ 3864 rev $output_h0, $output_h0 3865#endif 3866 eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low 3867#ifdef __AARCH64EB__ 3868 rev $output_l0, $output_l0 3869#endif 3870 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 3871 .L192_dec_blocks_less_than_1: @ blocks left <= 1 3872 3873 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff 3874 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite 3875 and $bit_length, $bit_length, #127 @ bit_length %= 128 3876 3877 sub $bit_length, $bit_length, #128 @ bit_length -= 128 3878 3879 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 3880 3881 and $bit_length, $bit_length, #127 @ bit_length %= 128 3882 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff 3883 3884 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block 3885 cmp 
$bit_length, #64 3886 3887 csel $ctr32x, $rk12_l, $rk12_h, lt 3888 csel $ctr96_b64x, $rk12_h, xzr, lt 3889 3890 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block 3891 and $output_l0, $output_l0, $ctr32x 3892 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes 3893 3894 orr $output_l0, $output_l0, $end_input_ptr 3895 mov $ctr0.d[1], $ctr96_b64x 3896#ifndef __AARCH64EB__ 3897 rev $ctr32w, $rctr32w 3898#else 3899 mov $ctr32w, $rctr32w 3900#endif 3901 3902 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 3903 str $ctr32w, [$counter, #12] @ store the updated counter 3904 3905 rev64 $res0b, $res1b @ GHASH final block 3906 3907 eor $res0b, $res0b, $t0.16b @ feed in partial tag 3908 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes 3909 3910 and $output_h0, $output_h0, $ctr96_b64x 3911 3912 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 3913 mov $t0d, $res0.d[1] @ GHASH final block - mid 3914 3915 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 3916 3917 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 3918 3919 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 3920 3921 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 3922 3923 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 3924 3925 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 3926 movi $mod_constant.8b, #0xc2 3927 3928 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 3929 3930 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 3931 3932 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 3933 3934 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 3935 orr $output_h0, $output_h0, $main_end_input_ptr 3936 stp $output_l0, $output_h0, [$output_ptr] 3937 3938 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 3939 3940 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 3941 3942 eor $acc_mb, $acc_mb, 
$acc_hb @ MODULO - fold into mid 3943 3944 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 3945 3946 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 3947 3948 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 3949 3950 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 3951 ext $acc_lb, $acc_lb, $acc_lb, #8 3952 rev64 $acc_lb, $acc_lb 3953 mov x0, $len 3954 st1 { $acc_l.16b }, [$current_tag] 3955 3956 ldp x21, x22, [sp, #16] 3957 ldp x23, x24, [sp, #32] 3958 ldp d8, d9, [sp, #48] 3959 ldp d10, d11, [sp, #64] 3960 ldp d12, d13, [sp, #80] 3961 ldp d14, d15, [sp, #96] 3962 ldp x19, x20, [sp], #112 3963 ret 3964 3965.L192_dec_ret: 3966 mov w0, #0x0 3967 ret 3968.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel 3969___ 3970} 3971 3972{ 3973my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); 3974my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); 3975my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); 3976my ($output_l0,$output_h0)=map("x$_",(6..7)); 3977 3978my $ctr32w="w9"; 3979my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15)); 3980my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); 3981 3982my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); 3983my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); 3984my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); 3985my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); 3986 3987my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); 3988my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); 3989my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); 3990 3991my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); 3992my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); 3993my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); 3994 3995my $t0="v8"; 3996my $t0d="d8"; 3997my 
$t1="v4"; 3998my $t1d="d4"; 3999my $t2="v8"; 4000my $t2d="d8"; 4001my $t3="v4"; 4002my $t3d="d4"; 4003my $t4="v4"; 4004my $t4d="d4"; 4005my $t5="v5"; 4006my $t5d="d5"; 4007my $t6="v8"; 4008my $t6d="d8"; 4009my $t7="v5"; 4010my $t7d="d5"; 4011my $t8="v6"; 4012my $t8d="d6"; 4013my $t9="v4"; 4014my $t9d="d4"; 4015 4016my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); 4017my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); 4018my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); 4019 4020my $mod_constantd="d8"; 4021my $mod_constant="v8"; 4022my $mod_t="v7"; 4023 4024my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31)); 4025my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31)); 4026my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31)); 4027my $rk2q1="v20.1q"; 4028my $rk3q1="v21.1q"; 4029my $rk4v="v22"; 4030my $rk4d="d22"; 4031 4032######################################################################################### 4033# size_t aes_gcm_enc_256_kernel(const unsigned char *in, 4034# size_t len, 4035# unsigned char *out, 4036# const void *key, 4037# unsigned char ivec[16], 4038# u64 *Xi); 4039# 4040$code.=<<___; 4041.global aes_gcm_enc_256_kernel 4042.type aes_gcm_enc_256_kernel,%function 4043.align 4 4044aes_gcm_enc_256_kernel: 4045 cbz x1, .L256_enc_ret 4046 stp x19, x20, [sp, #-112]! 
4047 mov x16, x4 4048 mov x8, x5 4049 stp x21, x22, [sp, #16] 4050 stp x23, x24, [sp, #32] 4051 stp d8, d9, [sp, #48] 4052 stp d10, d11, [sp, #64] 4053 stp d12, d13, [sp, #80] 4054 stp d14, d15, [sp, #96] 4055 4056 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 4057 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 4058 mov $len, $main_end_input_ptr 4059 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 4060#ifdef __AARCH64EB__ 4061 rev $ctr96_b64x, $ctr96_b64x 4062 rev $ctr96_t32x, $ctr96_t32x 4063#endif 4064 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 4065#ifdef __AARCH64EB__ 4066 ror $rk14_l, $rk14_l, #32 4067 ror $rk14_h, $rk14_h, #32 4068#endif 4069 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 4070 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 4071 4072 ld1 {$rk0s}, [$cc], #16 @ load rk0 4073 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 4074 4075 ld1 {$rk1s}, [$cc], #16 @ load rk1 4076 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 4077 4078 lsr $rctr32x, $ctr96_t32x, #32 4079 fmov $ctr2d, $ctr96_b64x @ CTR block 2 4080 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 4081 4082 rev $rctr32w, $rctr32w @ rev_ctr32 4083 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 4084 fmov $ctr1d, $ctr96_b64x @ CTR block 1 4085 4086 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 4087 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 4088 4089 rev $ctr32w, $rctr32w @ CTR block 1 4090 fmov $ctr3d, $ctr96_b64x @ CTR block 3 4091 4092 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 4093 add $rctr32w, $rctr32w, #1 @ CTR block 1 4094 ld1 {$rk2s}, [$cc], #16 @ load rk2 4095 4096 fmov $ctr1.d[1], $ctr32x @ CTR block 1 4097 rev $ctr32w, $rctr32w @ CTR block 2 4098 add $rctr32w, $rctr32w, #1 @ 
CTR block 2 4099 4100 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 4101 ld1 {$rk3s}, [$cc], #16 @ load rk3 4102 4103 fmov $ctr2.d[1], $ctr32x @ CTR block 2 4104 rev $ctr32w, $rctr32w @ CTR block 3 4105 4106 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 4107 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 4108 4109 fmov $ctr3.d[1], $ctr32x @ CTR block 3 4110 4111 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 4112 ld1 {$rk4s}, [$cc], #16 @ load rk4 4113 4114 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 4115 ld1 {$rk5s}, [$cc], #16 @ load rk5 4116 4117 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 4118 ld1 {$rk6s}, [$cc], #16 @ load rk6 4119 4120 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 4121 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 4122#ifndef __AARCH64EB__ 4123 ext $h3b, $h3b, $h3b, #8 4124#endif 4125 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 4126 ld1 {$rk7s}, [$cc], #16 @ load rk7 4127 4128 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 4129 ld1 {$rk8s}, [$cc], #16 @ load rk8 4130 4131 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 4132 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 4133#ifndef __AARCH64EB__ 4134 ext $h2b, $h2b, $h2b, #8 4135#endif 4136 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 4137 ld1 {$rk9s}, [$cc], #16 @ load rk9 4138 4139 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 4140 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 4141#ifndef __AARCH64EB__ 4142 ext $h4b, $h4b, $h4b, #8 4143#endif 4144 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 4145 ld1 {$rk10s}, [$cc], #16 @ load rk10 4146 4147 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 4148 ld1 {$rk11s}, [$cc], #16 @ load rk11 4149 4150 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 4151 add $rctr32w, 
$rctr32w, #1 @ CTR block 3 4152 4153 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 4154 4155 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 4156 ld1 { $acc_lb}, [$current_tag] 4157 ext $acc_lb, $acc_lb, $acc_lb, #8 4158 rev64 $acc_lb, $acc_lb 4159 4160 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 4161 4162 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 4163 4164 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 4165 4166 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 4167 4168 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 4169 4170 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 4171 4172 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 4173 4174 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 4175 4176 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 4177 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 4178 4179 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 4180 ld1 {$rk12s}, [$cc], #16 @ load rk12 4181 4182 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 4183 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 4184#ifndef __AARCH64EB__ 4185 ext $h1b, $h1b, $h1b, #8 4186#endif 4187 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 4188 ld1 {$rk13s}, [$cc], #16 @ load rk13 4189 4190 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 4191 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 4192 4193 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 4194 4195 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 4196 4197 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 4198 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 4199 4200 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 4201 4202 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 4203 4204 
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 4205 4206 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 4207 4208 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 4209 4210 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 4211 4212 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 4213 4214 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 4215 4216 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 4217 4218 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 4219 4220 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 4221 4222 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 4223 4224 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 4225 4226 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 4227 4228 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 4229 4230 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 4231 4232 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 4233 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 4234 4235 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 4236 4237 aese $ctr2b, $rk13 @ AES block 2 - round 13 4238 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 4239 4240 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 4241 4242 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 4243 4244 aese $ctr1b, $rk13 @ AES block 1 - round 13 4245 4246 aese $ctr0b, $rk13 @ AES block 0 - round 13 4247 4248 aese $ctr3b, $rk13 @ AES block 3 - round 13 4249 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 4250 b.ge .L256_enc_tail @ handle tail 4251 4252 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext 4253#ifdef __AARCH64EB__ 4254 rev $input_l1, $input_l1 4255 rev $input_h1, $input_h1 4256#endif 4257 rev $ctr32w, $rctr32w @ 
CTR block 4 4258 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext 4259#ifdef __AARCH64EB__ 4260 rev $input_l0, $input_l0 4261 rev $input_h0, $input_h0 4262#endif 4263 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext 4264#ifdef __AARCH64EB__ 4265 rev $input_l3, $input_l3 4266 rev $input_h3, $input_h3 4267#endif 4268 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext 4269#ifdef __AARCH64EB__ 4270 rev $input_l2, $input_l2 4271 rev $input_h2, $input_h2 4272#endif 4273 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 4274 4275 eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low 4276 eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high 4277 4278 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low 4279 eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low 4280 4281 eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high 4282 eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high 4283 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low 4284 4285 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 4286 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high 4287 eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low 4288 4289 eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low 4290 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high 4291 4292 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low 4293 add $rctr32w, $rctr32w, #1 @ CTR block 4 4294 4295 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 4296 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low 4297 eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high 4298 4299 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high 4300 4301 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result 4302 fmov $ctr0d, $ctr96_b64x @ CTR block 4 4303 4304 fmov $ctr0.d[1], $ctr32x @ CTR block 4 4305 rev $ctr32w, $rctr32w @ CTR block 5 4306 add $rctr32w, $rctr32w, #1 @ CTR block 
5 4307 4308 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result 4309 fmov $ctr1d, $ctr96_b64x @ CTR block 5 4310 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 4311 4312 fmov $ctr1.d[1], $ctr32x @ CTR block 5 4313 rev $ctr32w, $rctr32w @ CTR block 6 4314 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result 4315 4316 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high 4317 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 4318 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result 4319 4320 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result 4321 4322 add $rctr32w, $rctr32w, #1 @ CTR block 6 4323 fmov $ctr2d, $ctr96_b64x @ CTR block 6 4324 4325 fmov $ctr2.d[1], $ctr32x @ CTR block 6 4326 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result 4327 rev $ctr32w, $rctr32w @ CTR block 7 4328 4329 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 4330 4331 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result 4332 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result 4333 b.ge .L256_enc_prepretail @ do prepretail 4334 4335 .L256_enc_main_loop: @ main loop start 4336 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 4337 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 4338 4339 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 4340 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 4341 4342 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 4343 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 4344 4345 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 4346 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 4347 4348 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 4349 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext 4350#ifdef __AARCH64EB__ 4351 rev $input_l3, $input_l3 4352 rev $input_h3, $input_h3 4353#endif 4354 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 4355 ldp $input_l2,
$input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext 4356#ifdef __AARCH64EB__ 4357 rev $input_l2, $input_l2 4358 rev $input_h2, $input_h2 4359#endif 4360 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 4361 eor $res0b, $res0b, $acc_lb @ PRE 1 4362 4363 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 4364 4365 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 4366 eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low 4367 4368 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 4369 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 4370 4371 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 4372 eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high 4373 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 4374 4375 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 4376 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 4377 4378 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 4379 4380 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 4381 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 4382 4383 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 4384 4385 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 4386 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 4387 4388 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 4389 4390 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 4391 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 4392 4393 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 4394 4395 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 4396 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 4397 4398 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 4399 4400 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 4401 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 
4402 4403 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 4404 4405 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 4406 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 4407 4408 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 4409 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 4410 4411 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 4412 4413 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 4414 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 4415 4416 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 4417 4418 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 4419 4420 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 4421 4422 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 4423 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 4424 4425 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 4426 4427 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 4428 4429 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 4430 4431 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 4432 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 4433 4434 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 4435 4436 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 4437 4438 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 4439 4440 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 4441 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 4442 4443 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 4444 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext 4445#ifdef __AARCH64EB__ 4446 rev $input_l1, $input_l1 4447 rev $input_h1, $input_h1 4448#endif 4449 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 4450 mov $t9d, $res3.d[1] @ 
GHASH block 4k+3 - mid 4451 4452 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 4453 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 4454 4455 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 4456 4457 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 4458 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 4459 4460 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 4461 eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low 4462 4463 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 4464 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 4465 4466 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 4467 eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low 4468 4469 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 4470 movi $mod_constant.8b, #0xc2 4471 4472 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 4473 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 4474 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low 4475 4476 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 4477 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext 4478#ifdef __AARCH64EB__ 4479 rev $input_l0, $input_l0 4480 rev $input_h0, $input_h0 4481#endif 4482 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 4483 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 4484 4485 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 4486 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 4487 4488 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 4489 4490 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 4491 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 4492 4493 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 4494 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 4495 4496 aese $ctr0b, 
$rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 4497 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 4498 4499 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 4500 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 4501 4502 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 4503 rev $ctr32w, $rctr32w @ CTR block 4k+8 4504 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 4505 4506 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 4507 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low 4508 4509 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 4510 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 4511 4512 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 4513 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high 4514 4515 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 4516 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 4517 eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid 4518 4519 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 4520 eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high 4521 4522 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 4523 eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high 4524 4525 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 4526 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 4527 4528 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 4529 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 4530 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 4531 4532 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 4533 fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low 4534 4535 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 4536 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high 4537 4538 fmov 
$ctr_t2d, $input_l2 @ AES block 4k+6 - mov low 4539 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 4540 4541 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high 4542 4543 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 4544 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 4545 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 4546 4547 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 4548 rev $ctr32w, $rctr32w @ CTR block 4k+9 4549 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 4550 4551 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result 4552 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 4553 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 4554 4555 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 4556 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 4557 4558 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 4559 rev $ctr32w, $rctr32w @ CTR block 4k+10 4560 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result 4561 4562 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 4563 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 4564 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high 4565 4566 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 4567 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result 4568 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 4569 4570 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 4571 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result 4572 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 4573 4574 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result 4575 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 4576 rev $ctr32w, $rctr32w @ CTR block 4k+11 4577 4578 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 4579 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 4580 4581 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result 4582 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result 4583 b.lt 
L256_enc_main_loop 4584 4585 .L256_enc_prepretail: @ PREPRETAIL 4586 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 4587 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 4588 4589 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 4590 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 4591 4592 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 4593 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 4594 4595 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 4596 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 4597 4598 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 4599 4600 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 4601 4602 eor $res0b, $res0b, $acc_lb @ PRE 1 4603 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 4604 4605 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 4606 4607 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 4608 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 4609 4610 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 4611 4612 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 4613 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 4614 4615 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 4616 4617 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 4618 4619 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 4620 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 4621 4622 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 4623 4624 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 4625 4626 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 4627 4628 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 4629 4630 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 4631 4632 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 4633 4634 aese $ctr3b, $rk2 \n 
aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 4635 4636 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 4637 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 4638 4639 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 4640 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 4641 4642 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 4643 4644 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 4645 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 4646 4647 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 4648 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 4649 4650 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 4651 4652 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 4653 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 4654 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 4655 4656 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 4657 4658 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 4659 4660 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 4661 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 4662 4663 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 4664 4665 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 4666 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 4667 4668 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 4669 4670 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 4671 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 4672 4673 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 4674 4675 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 4676 4677 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 4678 4679 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 4680 4681 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 4682 4683 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 4684 eor 
$acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 4685 4686 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 4687 4688 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 4689 4690 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 4691 4692 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 4693 movi $mod_constant.8b, #0xc2 4694 4695 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 4696 4697 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 4698 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 4699 4700 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 4701 4702 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 4703 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 4704 4705 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 4706 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 4707 4708 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 4709 4710 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 4711 4712 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 4713 4714 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 4715 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 4716 4717 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 4718 4719 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up 4720 4721 pmull $t1.1q, $acc_h.1d, $mod_constant.1d 4722 ext $acc_hb, $acc_hb, $acc_hb, #8 4723 4724 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 4725 4726 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 4727 eor $acc_mb, $acc_mb, $acc_lb 4728 4729 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 4730 4731 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 4732 4733 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - 
round 8 4734 4735 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 4736 eor $acc_mb, $acc_mb, $t1.16b 4737 4738 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 4739 4740 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 4741 4742 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 4743 4744 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 4745 eor $acc_mb, $acc_mb, $acc_hb 4746 4747 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 4748 4749 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 4750 4751 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 4752 4753 pmull $t1.1q, $acc_m.1d, $mod_constant.1d 4754 4755 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 4756 ext $acc_mb, $acc_mb, $acc_mb, #8 4757 4758 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 4759 4760 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 4761 eor $acc_lb, $acc_lb, $t1.16b 4762 4763 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 4764 4765 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 4766 4767 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 4768 4769 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 4770 eor $acc_lb, $acc_lb, $acc_mb 4771 .L256_enc_tail: @ TAIL 4772 4773 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 4774 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 4775 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext 4776#ifdef __AARCH64EB__ 4777 rev $input_l0, $input_l0 4778 rev $input_h0, $input_h0 4779#endif 4780 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low 4781 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high 4782 4783 cmp $main_end_input_ptr, #48 4784 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 4785 4786 
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 4787 4788 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 4789 b.gt .L256_enc_blocks_more_than_3 4790 4791 cmp $main_end_input_ptr, #32 4792 mov $ctr3b, $ctr2b 4793 movi $acc_l.8b, #0 4794 4795 movi $acc_h.8b, #0 4796 sub $rctr32w, $rctr32w, #1 4797 4798 mov $ctr2b, $ctr1b 4799 movi $acc_m.8b, #0 4800 b.gt .L256_enc_blocks_more_than_2 4801 4802 mov $ctr3b, $ctr1b 4803 sub $rctr32w, $rctr32w, #1 4804 cmp $main_end_input_ptr, #16 4805 4806 b.gt .L256_enc_blocks_more_than_1 4807 4808 sub $rctr32w, $rctr32w, #1 4809 b .L256_enc_blocks_less_than_1 4810 .L256_enc_blocks_more_than_3: @ blocks left > 3 4811 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result 4812 4813 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high 4814#ifdef __AARCH64EB__ 4815 rev $input_l0, $input_l0 4816 rev $input_h0, $input_h0 4817#endif 4818 rev64 $res0b, $res1b @ GHASH final-3 block 4819 4820 eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low 4821 eor $res0b, $res0b, $t0.16b @ feed in partial tag 4822 4823 eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high 4824 4825 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 4826 fmov $res1d, $input_l0 @ AES final-2 block - mov low 4827 4828 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high 4829 4830 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 4831 movi $t0.8b, #0 @ suppress further partial tag feed in 4832 4833 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 4834 4835 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 4836 4837 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 4838 4839 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 4840 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result 4841 .L256_enc_blocks_more_than_2: @ blocks left > 2 4842 4843 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result 4844 4845 
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high 4846#ifdef __AARCH64EB__ 4847 rev $input_l0, $input_l0 4848 rev $input_h0, $input_h0 4849#endif 4850 rev64 $res0b, $res1b @ GHASH final-2 block 4851 4852 eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low 4853 eor $res0b, $res0b, $t0.16b @ feed in partial tag 4854 4855 fmov $res1d, $input_l0 @ AES final-1 block - mov low 4856 eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high 4857 4858 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high 4859 4860 movi $t0.8b, #0 @ suppress further partial tag feed in 4861 4862 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 4863 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 4864 4865 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 4866 4867 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 4868 4869 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result 4870 4871 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 4872 4873 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 4874 4875 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 4876 4877 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 4878 .L256_enc_blocks_more_than_1: @ blocks left > 1 4879 4880 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result 4881 4882 rev64 $res0b, $res1b @ GHASH final-1 block 4883 4884 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high 4885#ifdef __AARCH64EB__ 4886 rev $input_l0, $input_l0 4887 rev $input_h0, $input_h0 4888#endif 4889 eor $res0b, $res0b, $t0.16b @ feed in partial tag 4890 4891 movi $t0.8b, #0 @ suppress further partial tag feed in 4892 4893 eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low 4894 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 4895 4896 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 4897 eor $input_h0, $input_h0, $rk14_h @ AES final block 
- round 14 high 4898 4899 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 4900 4901 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 4902 4903 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 4904 fmov $res1d, $input_l0 @ AES final block - mov low 4905 4906 fmov $res1.d[1], $input_h0 @ AES final block - mov high 4907 4908 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 4909 4910 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 4911 4912 eor $res1b, $res1b, $ctr3b @ AES final block - result 4913 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 4914 4915 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 4916 .L256_enc_blocks_less_than_1: @ blocks left <= 1 4917 4918 and $bit_length, $bit_length, #127 @ bit_length %= 128 4919 4920 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff 4921 sub $bit_length, $bit_length, #128 @ bit_length -= 128 4922 4923 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 4924 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored 4925 4926 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff 4927 and $bit_length, $bit_length, #127 @ bit_length %= 128 4928 4929 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block 4930 cmp $bit_length, #64 4931 4932 csel $input_l0, $rk14_l, $rk14_h, lt 4933 csel $input_h0, $rk14_h, xzr, lt 4934 4935 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block 4936 4937 fmov $ctr0.d[1], $input_h0 4938 4939 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 4940 4941 rev64 $res0b, $res1b @ GHASH final block 4942 4943 eor $res0b, $res0b, $t0.16b @ feed in partial tag 4944 4945 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing 4946 4947 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 4948 mov $t0d, $res0.d[1] @ GHASH final block - mid 4949#ifndef __AARCH64EB__ 4950 rev $ctr32w, 
$rctr32w 4951#else 4952 mov $ctr32w, $rctr32w 4953#endif 4954 4955 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 4956 4957 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 4958 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 4959 4960 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 4961 4962 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 4963 4964 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 4965 movi $mod_constant.8b, #0xc2 4966 4967 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 4968 4969 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 4970 4971 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 4972 4973 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 4974 4975 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 4976 4977 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 4978 4979 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 4980 4981 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 4982 4983 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 4984 4985 str $ctr32w, [$counter, #12] @ store the updated counter 4986 4987 st1 { $res1b}, [$output_ptr] @ store all 16B 4988 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 4989 4990 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 4991 ext $acc_lb, $acc_lb, $acc_lb, #8 4992 rev64 $acc_lb, $acc_lb 4993 mov x0, $len 4994 st1 { $acc_l.16b }, [$current_tag] 4995 4996 ldp x21, x22, [sp, #16] 4997 ldp x23, x24, [sp, #32] 4998 ldp d8, d9, [sp, #48] 4999 ldp d10, d11, [sp, #64] 5000 ldp d12, d13, [sp, #80] 5001 ldp d14, d15, [sp, #96] 5002 ldp x19, x20, [sp], #112 5003 ret 5004 5005.L256_enc_ret: 5006 mov w0, #0x0 5007 ret 5008.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel 5009___ 5010 5011{ 5012my $t8="v4"; 5013my $t8d="d4"; 5014my $t9="v6"; 5015my $t9d="d6"; 
5016######################################################################################### 5017# size_t aes_gcm_dec_256_kernel(const unsigned char *in, 5018# size_t len, /* length in BITS, not bytes: the prologue shifts it right by 3 to obtain byte_len */ 5019# unsigned char *out, 5020# u64 *Xi, 5021# unsigned char ivec[16], 5022# const void *key); 5023# NOTE(review): the last three arguments are listed in the order the prologue appears to consume them (x4 is copied to x16 and x5 to x8, presumably the counter/ivec and key-schedule pointers, leaving the Xi/hash-table pointer in x3); confirm against the C declaration of this kernel. 5024$code.=<<___; 5025.global aes_gcm_dec_256_kernel 5026.type aes_gcm_dec_256_kernel,%function 5027.align 4 5028aes_gcm_dec_256_kernel: 5029 cbz x1, .L256_dec_ret 5030 stp x19, x20, [sp, #-112]! 5031 mov x16, x4 5032 mov x8, x5 5033 stp x21, x22, [sp, #16] 5034 stp x23, x24, [sp, #32] 5035 stp d8, d9, [sp, #48] 5036 stp d10, d11, [sp, #64] 5037 stp d12, d13, [sp, #80] 5038 stp d14, d15, [sp, #96] 5039 5040 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 5041 mov $len, $main_end_input_ptr 5042 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 5043#ifdef __AARCH64EB__ 5044 rev $ctr96_b64x, $ctr96_b64x 5045 rev $ctr96_t32x, $ctr96_t32x 5046#endif 5047 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 5048#ifdef __AARCH64EB__ 5049 ror $rk14_h, $rk14_h, #32 5050 ror $rk14_l, $rk14_l, #32 5051#endif 5052 ld1 {$rk0s}, [$cc], #16 @ load rk0 5053 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 5054 5055 ld1 {$rk1s}, [$cc], #16 @ load rk1 5056 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 5057 5058 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 5059 ld1 {$rk2s}, [$cc], #16 @ load rk2 5060 5061 lsr $rctr32x, $ctr96_t32x, #32 5062 ld1 {$rk3s}, [$cc], #16 @ load rk3 5063 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 5064 5065 ld1 {$rk4s}, [$cc], #16 @ load rk4 5066 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 5067 rev $rctr32w, $rctr32w @ rev_ctr32 5068 5069 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 5070 fmov $ctr3d, $ctr96_b64x @ CTR block 3 5071 5072 rev $ctr32w, $rctr32w @ CTR block 1 5073 add $rctr32w, $rctr32w, #1 @ CTR block 1 
5074 fmov $ctr1d, $ctr96_b64x @ CTR block 1 5075 5076 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 5077 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 5078 5079 fmov $ctr1.d[1], $ctr32x @ CTR block 1 5080 rev $ctr32w, $rctr32w @ CTR block 2 5081 add $rctr32w, $rctr32w, #1 @ CTR block 2 5082 5083 fmov $ctr2d, $ctr96_b64x @ CTR block 2 5084 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 5085 5086 fmov $ctr2.d[1], $ctr32x @ CTR block 2 5087 rev $ctr32w, $rctr32w @ CTR block 3 5088 5089 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 5090 ld1 {$rk5s}, [$cc], #16 @ load rk5 5091 5092 fmov $ctr3.d[1], $ctr32x @ CTR block 3 5093 add $rctr32w, $rctr32w, #1 @ CTR block 3 5094 5095 ld1 {$rk6s}, [$cc], #16 @ load rk6 5096 5097 ld1 {$rk7s}, [$cc], #16 @ load rk7 5098 5099 ld1 {$rk8s}, [$cc], #16 @ load rk8 5100 5101 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 5102 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 5103#ifndef __AARCH64EB__ 5104 ext $h3b, $h3b, $h3b, #8 5105#endif 5106 5107 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 5108 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 5109#ifndef __AARCH64EB__ 5110 ext $h4b, $h4b, $h4b, #8 5111#endif 5112 5113 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 5114 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 5115#ifndef __AARCH64EB__ 5116 ext $h2b, $h2b, $h2b, #8 5117#endif 5118 5119 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 5120 ld1 {$rk9s}, [$cc], #16 @ load rk9 5121 5122 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 5123 5124 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 5125 ld1 { $acc_lb}, [$current_tag] 5126 ext $acc_lb, $acc_lb, $acc_lb, #8 5127 rev64 $acc_lb, $acc_lb 5128 5129 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 5130 ld1 {$rk10s}, [$cc], #16 @ load rk10 5131 5132 
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 5133 ld1 {$rk11s}, [$cc], #16 @ load rk11 5134 5135 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 5136 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 5137#ifndef __AARCH64EB__ 5138 ext $h1b, $h1b, $h1b, #8 5139#endif 5140 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 5141 ld1 {$rk12s}, [$cc], #16 @ load rk12 5142 5143 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 5144 5145 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 5146 5147 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 5148 5149 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 5150 5151 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 5152 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 5153 5154 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 5155 5156 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 5157 5158 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 5159 5160 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 5161 5162 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 5163 5164 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 5165 5166 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 5167 5168 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 5169 5170 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 5171 5172 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 5173 5174 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 5175 5176 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 5177 5178 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 5179 5180 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 5181 5182 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - 
round 7 5183 5184 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 5185 5186 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 5187 5188 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 5189 5190 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 5191 5192 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 5193 5194 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 5195 5196 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 5197 ld1 {$rk13s}, [$cc], #16 @ load rk13 5198 5199 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 5200 5201 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 5202 5203 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 5204 5205 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 5206 5207 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 5208 5209 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 5210 5211 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 5212 5213 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 5214 5215 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 5216 5217 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 5218 5219 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 5220 5221 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 5222 5223 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 5224 5225 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 5226 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 5227 5228 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 5229 5230 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 5231 5232 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 5233 5234 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 5235 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | 
h3k 5236 5237 aese $ctr1b, $rk13 @ AES block 1 - round 13 5238 5239 aese $ctr2b, $rk13 @ AES block 2 - round 13 5240 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 5241 5242 aese $ctr3b, $rk13 @ AES block 3 - round 13 5243 5244 aese $ctr0b, $rk13 @ AES block 0 - round 13 5245 b.ge .L256_dec_tail @ handle tail 5246 5247 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext 5248 5249 rev $ctr32w, $rctr32w @ CTR block 4 5250 5251 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result 5252 5253 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result 5254 rev64 $res1b, $res1b @ GHASH block 1 5255 ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext 5256 5257 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high 5258 5259 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low 5260 rev64 $res0b, $res0b @ GHASH block 0 5261 add $rctr32w, $rctr32w, #1 @ CTR block 4 5262 5263 fmov $ctr0d, $ctr96_b64x @ CTR block 4 5264 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 5265 5266 fmov $ctr0.d[1], $ctr32x @ CTR block 4 5267 rev $ctr32w, $rctr32w @ CTR block 5 5268 add $rctr32w, $rctr32w, #1 @ CTR block 5 5269 5270 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low 5271 5272 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 5273 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high 5274 eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high 5275#ifdef __AARCH64EB__ 5276 rev $output_h0, $output_h0 5277#endif 5278 eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low 5279#ifdef __AARCH64EB__ 5280 rev $output_l0, $output_l0 5281#endif 5282 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result 5283 fmov $ctr1d, $ctr96_b64x @ CTR block 5 5284 5285 ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext 5286 5287 fmov $ctr1.d[1], $ctr32x @ CTR block 5 5288 rev $ctr32w, $rctr32w @ CTR block 6 5289 add $rctr32w, $rctr32w, #1 @ CTR block 6 5290 5291 eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 
low 5292#ifdef __AARCH64EB__ 5293 rev $output_l1, $output_l1 5294#endif 5295 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 5296 5297 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high 5298#ifdef __AARCH64EB__ 5299 rev $output_h1, $output_h1 5300#endif 5301 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result 5302 5303 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result 5304 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 5305 b.ge .L256_dec_prepretail @ do prepretail 5306 5307 .L256_dec_main_loop: @ main loop start 5308 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 5309 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 5310 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 5311 5312 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 5313 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 5314 5315 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 5316 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 5317 5318 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 5319 eor $res0b, $res0b, $acc_lb @ PRE 1 5320 rev $ctr32w, $rctr32w @ CTR block 4k+7 5321 5322 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 5323 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 5324 5325 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 5326 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 5327 5328 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 5329 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 5330 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 5331 5332 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 5333 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 5334 5335 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 5336 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 5337 5338 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 5339 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - 
mid 5340 5341 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 5342 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high 5343#ifdef __AARCH64EB__ 5344 rev $output_h2, $output_h2 5345#endif 5346 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 5347 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 5348 5349 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 5350 rev64 $res2b, $res2b @ GHASH block 4k+2 5351 5352 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 5353 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low 5354#ifdef __AARCH64EB__ 5355 rev $output_l2, $output_l2 5356#endif 5357 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 5358 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 5359 5360 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 5361 5362 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 5363 5364 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 5365 rev64 $res3b, $res3b @ GHASH block 4k+3 5366 5367 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 5368 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low 5369#ifdef __AARCH64EB__ 5370 rev $output_l3, $output_l3 5371#endif 5372 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 5373 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high 5374#ifdef __AARCH64EB__ 5375 rev $output_h3, $output_h3 5376#endif 5377 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 5378 5379 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 5380 5381 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 5382 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 5383 5384 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 5385 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 5386 5387 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 
4k+6 - round 5 5388 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 5389 5390 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 5391 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 5392 5393 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 5394 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 5395 5396 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 5397 5398 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 5399 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 5400 5401 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 5402 5403 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 5404 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 5405 5406 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 5407 rev $ctr32w, $rctr32w @ CTR block 4k+8 5408 5409 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 5410 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 5411 5412 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 5413 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 5414 5415 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 5416 5417 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 5418 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 5419 5420 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 5421 5422 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 5423 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 5424 5425 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 5426 5427 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 5428 5429 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 5430 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 5431 5432 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 5433 5434 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 5435 orr $ctr32x, 
$ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 5436 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 5437 5438 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 5439 5440 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 5441 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 5442 5443 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 5444 5445 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 5446 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 5447 5448 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 5449 5450 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 5451 movi $mod_constant.8b, #0xc2 5452 5453 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 5454 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 5455 5456 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 5457 5458 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 5459 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 5460 5461 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 5462 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 5463 5464 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 5465 5466 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 5467 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 5468 5469 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 5470 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 5471 5472 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 5473 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 5474 5475 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 5476 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 5477 5478 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 5479 ld1 {$res1b}, [$input_ptr], #16 @ AES block 
4k+5 - load ciphertext 5480 5481 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 5482 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result 5483 5484 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 5485 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 5486 5487 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 5488 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 5489 5490 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 5491 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext 5492 5493 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 5494 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext 5495 5496 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 5497 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 5498 5499 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 5500 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 5501 5502 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 5503 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 5504 5505 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 5506 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 5507 5508 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 5509 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 5510 5511 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 5512 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result 5513 rev $ctr32w, $rctr32w @ CTR block 4k+9 5514 5515 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 5516 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 5517 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 5518 5519 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 5520 5521 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low 5522#ifdef __AARCH64EB__ 5523 rev $output_l0, $output_l0 
5524#endif 5525 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high 5526#ifdef __AARCH64EB__ 5527 rev $output_h0, $output_h0 5528#endif 5529 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high 5530 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result 5531 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 5532 5533 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 5534 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low 5535 5536 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 5537 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 5538 5539 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 5540 rev $ctr32w, $rctr32w @ CTR block 4k+10 5541 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 5542 5543 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 5544 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 5545 5546 rev64 $res1b, $res1b @ GHASH block 4k+5 5547 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high 5548#ifdef __AARCH64EB__ 5549 rev $output_h1, $output_h1 5550#endif 5551 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result 5552 5553 eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low 5554#ifdef __AARCH64EB__ 5555 rev $output_l1, $output_l1 5556#endif 5557 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result 5558 5559 rev64 $res0b, $res0b @ GHASH block 4k+4 5560 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 5561 b.lt .L256_dec_main_loop 5562 5563 5564 .L256_dec_prepretail: @ PREPRETAIL 5565 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 5566 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 5567 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 5568 5569 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 5570 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 5571 5572 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 5573 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 
5574 5575 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 5576 rev $ctr32w, $rctr32w @ CTR block 4k+7 5577 eor $res0b, $res0b, $acc_lb @ PRE 1 5578 5579 rev64 $res2b, $res2b @ GHASH block 4k+2 5580 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 5581 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 5582 5583 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 5584 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 5585 5586 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 5587 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 5588 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 5589 5590 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 5591 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 5592 5593 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 5594 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 5595 5596 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 5597 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 5598 5599 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 5600 5601 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 5602 rev64 $res3b, $res3b @ GHASH block 4k+3 5603 5604 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 5605 5606 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 5607 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 5608 5609 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 5610 5611 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 5612 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 5613 5614 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 5615 5616 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 5617 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 5618 5619 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 5620 5621 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 5622 mov $t6d, $res2.d[1] @ 
GHASH block 4k+2 - mid 5623 5624 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 5625 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 5626 5627 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 5628 5629 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 5630 5631 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 5632 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 5633 5634 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 5635 5636 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 5637 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 5638 5639 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 5640 5641 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 5642 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 5643 5644 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 5645 5646 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 5647 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 5648 5649 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 5650 5651 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 5652 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 5653 5654 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 5655 5656 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 5657 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 5658 5659 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 5660 5661 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 5662 5663 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 5664 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 5665 5666 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 5667 5668 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 5669 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 5670 5671 aese $ctr2b, 
$rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 5672 5673 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 5674 movi $mod_constant.8b, #0xc2 5675 5676 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 5677 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 5678 5679 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 5680 5681 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 5682 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 5683 5684 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 5685 5686 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 5687 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 5688 5689 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 5690 5691 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 5692 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 5693 5694 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 5695 5696 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 5697 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 5698 5699 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 5700 5701 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 5702 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 5703 5704 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 5705 5706 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 5707 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 5708 5709 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 5710 5711 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 5712 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 5713 5714 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 5715 5716 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 
4k+7 - round 10 5717 5718 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 5719 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high 5720#ifdef __AARCH64EB__ 5721 rev $output_h2, $output_h2 5722#endif 5723 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 5724 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low 5725#ifdef __AARCH64EB__ 5726 rev $output_l3, $output_l3 5727#endif 5728 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 5729 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 5730 5731 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 5732 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 5733 5734 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 5735 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low 5736#ifdef __AARCH64EB__ 5737 rev $output_l2, $output_l2 5738#endif 5739 5740 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 5741 5742 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 5743 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high 5744#ifdef __AARCH64EB__ 5745 rev $output_h3, $output_h3 5746#endif 5747 5748 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 5749 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 5750 5751 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 5752 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 5753 5754 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 5755 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 5756 5757 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 5758 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 5759 5760 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 5761 5762 aese $ctr0b, $rk13 @ AES block 4k+4 - 
round 13 5763 5764 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 5765 5766 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 5767 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 5768 .L256_dec_tail: @ TAIL 5769 5770 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 5771 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 5772 5773 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result 5774 5775 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 5776 5777 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 5778 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 5779 5780 cmp $main_end_input_ptr, #48 5781 5782 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low 5783#ifdef __AARCH64EB__ 5784 rev $output_l0, $output_l0 5785#endif 5786 5787 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high 5788#ifdef __AARCH64EB__ 5789 rev $output_h0, $output_h0 5790#endif 5791 b.gt .L256_dec_blocks_more_than_3 5792 5793 sub $rctr32w, $rctr32w, #1 5794 mov $ctr3b, $ctr2b 5795 movi $acc_m.8b, #0 5796 5797 movi $acc_l.8b, #0 5798 cmp $main_end_input_ptr, #32 5799 5800 movi $acc_h.8b, #0 5801 mov $ctr2b, $ctr1b 5802 b.gt .L256_dec_blocks_more_than_2 5803 5804 sub $rctr32w, $rctr32w, #1 5805 5806 mov $ctr3b, $ctr1b 5807 cmp $main_end_input_ptr, #16 5808 b.gt .L256_dec_blocks_more_than_1 5809 5810 sub $rctr32w, $rctr32w, #1 5811 b .L256_dec_blocks_less_than_1 5812 .L256_dec_blocks_more_than_3: @ blocks left > 3 5813 rev64 $res0b, $res1b @ GHASH final-3 block 5814 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext 5815 5816 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result 5817 5818 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 5819 5820 eor $res0b, $res0b, $t0.16b @ feed in partial tag 5821 5822 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result 5823 5824 mov $rk4d, $res0.d[1] @ GHASH final-3 block - 
mid 5825 5826 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low 5827 5828 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high 5829 5830 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 5831 5832 movi $t0.8b, #0 @ suppress further partial tag feed in 5833 5834 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 5835 5836 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 5837 eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low 5838#ifdef __AARCH64EB__ 5839 rev $output_l0, $output_l0 5840#endif 5841 5842 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 5843 eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high 5844#ifdef __AARCH64EB__ 5845 rev $output_h0, $output_h0 5846#endif 5847 .L256_dec_blocks_more_than_2: @ blocks left > 2 5848 5849 rev64 $res0b, $res1b @ GHASH final-2 block 5850 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext 5851 5852 eor $res0b, $res0b, $t0.16b @ feed in partial tag 5853 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result 5854 5855 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result 5856 5857 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 5858 5859 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 5860 5861 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 5862 5863 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 5864 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low 5865 5866 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high 5867 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 5868 movi $t0.8b, #0 @ suppress further partial tag feed in 5869 5870 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 5871 5872 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 5873 eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low 5874#ifdef __AARCH64EB__ 5875 rev $output_l0, $output_l0 5876#endif 5877 5878 eor 
$acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 5879 eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high 5880#ifdef __AARCH64EB__ 5881 rev $output_h0, $output_h0 5882#endif 5883 .L256_dec_blocks_more_than_1: @ blocks left > 1 5884 5885 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result 5886 rev64 $res0b, $res1b @ GHASH final-1 block 5887 5888 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext 5889 5890 eor $res0b, $res0b, $t0.16b @ feed in partial tag 5891 movi $t0.8b, #0 @ suppress further partial tag feed in 5892 5893 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 5894 5895 eor $ctr0b, $res1b, $ctr3b @ AES final block - result 5896 5897 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 5898 5899 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 5900 5901 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 5902 mov $output_l0, $ctr0.d[0] @ AES final block - mov low 5903 5904 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 5905 5906 mov $output_h0, $ctr0.d[1] @ AES final block - mov high 5907 5908 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 5909 eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low 5910#ifdef __AARCH64EB__ 5911 rev $output_l0, $output_l0 5912#endif 5913 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 5914 5915 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 5916 5917 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 5918 eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high 5919#ifdef __AARCH64EB__ 5920 rev $output_h0, $output_h0 5921#endif 5922 .L256_dec_blocks_less_than_1: @ blocks left <= 1 5923 5924 and $bit_length, $bit_length, #127 @ bit_length %= 128 5925 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff 5926 5927 sub $bit_length, $bit_length, #128 @ bit_length -= 128 5928 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff 5929 5930 ldp $end_input_ptr, 
$main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite 5931 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 5932 5933 and $bit_length, $bit_length, #127 @ bit_length %= 128 5934 5935 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block 5936 cmp $bit_length, #64 5937 5938 csel $ctr32x, $rk14_l, $rk14_h, lt 5939 csel $ctr96_b64x, $rk14_h, xzr, lt 5940 5941 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block 5942 and $output_l0, $output_l0, $ctr32x 5943 5944 mov $ctr0.d[1], $ctr96_b64x 5945 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes 5946 5947#ifndef __AARCH64EB__ 5948 rev $ctr32w, $rctr32w 5949#else 5950 mov $ctr32w, $rctr32w 5951#endif 5952 5953 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes 5954 5955 orr $output_l0, $output_l0, $end_input_ptr 5956 5957 and $output_h0, $output_h0, $ctr96_b64x 5958 5959 orr $output_h0, $output_h0, $main_end_input_ptr 5960 5961 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 5962 5963 rev64 $res0b, $res1b @ GHASH final block 5964 5965 eor $res0b, $res0b, $t0.16b @ feed in partial tag 5966 5967 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 5968 5969 mov $t0d, $res0.d[1] @ GHASH final block - mid 5970 5971 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 5972 5973 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 5974 5975 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 5976 5977 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 5978 5979 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 5980 5981 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 5982 movi $mod_constant.8b, #0xc2 5983 5984 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 5985 5986 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 5987 5988 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 5989 5990 pmull 
$mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid

	ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment

	eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid

	eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid

	pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low

	ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment

	eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low

	stp $output_l0, $output_h0, [$output_ptr]

	str $ctr32w, [$counter, #12] @ store the updated counter

	eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
	ext $acc_lb, $acc_lb, $acc_lb, #8
	rev64 $acc_lb, $acc_lb
	mov x0, $len
	st1 { $acc_l.16b }, [$current_tag]

	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret

.L256_dec_ret:
	mov w0, #0x0
	ret
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
___
}
}

# Trailer appended after all kernels: identification string, alignment
# directive and an "#endif" (presumably closing a preprocessor conditional
# opened earlier in the generated output -- its opener is not visible here).
$code .= <<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___

# Print the accumulated assembly in $code line by line, translated
# according to $flavour.
if ($flavour =~ /64/) { ######## 64-bit code
    # Convert an operand string of the form "qA#lo|hi, qB#lo|hi" into an
    # "ins vX.d[i],vY.d[j]" instruction: register numbers of 8 and above
    # are shifted up by 8, and lo/hi select lane 0/1.  Yields a false
    # value when the operands do not have that form.  The output loop
    # below does not call it.
    sub unvmov {
        my ($operands) = @_;

        return $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o
            && sprintf("ins v%d.d[%d],v%d.d[%d]",
                       ($1 < 8 ? $1 : $1 + 8), (($2 eq "lo") ? 0 : 1),
                       ($3 < 8 ? $3 : $3 + 8), (($4 eq "lo") ? 0 : 1));
    }

    for my $line (split("\n", $code)) {
        # Only the first "@<whitespace>" on a line is rewritten.
        $line =~ s/@\s/\/\//o; # old->new style commentary
        print $line, "\n";
    }
}
else { ######## 32-bit code
    # Rewrite "qD,qM[i]" as "vdup.32 qD,dN[j]" with N = 2*M + (i>>1) and
    # j = i&1.  Yields a false value when the operands do not match.
    sub unvdup32 {
        my ($operands) = @_;

        return $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o
            && sprintf("vdup.32 q%d,d%d[%d]", $1, 2 * $2 + ($3 >> 1), $3 & 1);
    }

    # Hand-encode a three-register "pmull"/"pmull2" on q registers as a raw
    # instruction word wrapped in INST(...), keeping the original mnemonic
    # and operands as a trailing comment.
    sub unvpmullp64 {
        my ($mnemonic, $operands) = @_;

        if ($operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
            # Copy the captures before the next pattern match replaces them.
            my ($qd, $qn, $qm) = ($1, $2, $3);

            my $word = 0xf2a00e00;
            $word |= (($qd & 7) << 13) | (($qd & 8) << 19);
            $word |= (($qn & 7) << 17) | (($qn & 8) << 4);
            $word |= (($qm & 7) << 1)  | (($qm & 8) << 2);
            # The "2" variant of the mnemonic sets two extra bits.
            $word |= 0x00010001 if ($mnemonic =~ "2");

            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(
            return sprintf("INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                           (map { ($word >> $_) & 0xff } (0, 8, 16, 24)),
                           $mnemonic, $operands);
        }
    }

    for my $line (split("\n", $code)) {
        $line =~ s/\b[wx]([0-9]+)\b/r$1/go;             # new->old registers
        $line =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;  # new->old registers
        $line =~ s/\/\/\s?/@ /o;                        # new->old style commentary

        # fix up remaining new-style suffixes
        $line =~ s/\],#[0-9]+/]!/o;

        # At most one of these rewrites is applied per line: the chain
        # stops at the first substitution that succeeds.
        $line =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o
            or $line =~ s/vdup\.32\s+(.*)/unvdup32($1)/geo
            or $line =~ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo
            or $line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo
            or $line =~ s/^(\s+)b\./$1b/o
            or $line =~ s/^(\s+)ret/$1bx\tlr/o;

        # A conditional "mov.<cond>" becomes "mov<cond>" preceded by an
        # "it <cond>" line.
        if ($line =~ s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
            print " it $2\n";
        }
        $line =~ s/__AARCH64E([BL])__/__ARME$1__/go;
        print $line, "\n";
    }
}

# Closing STDOUT explicitly surfaces any buffered write error.
close(STDOUT) or die "error closing STDOUT: $!"; # enforce flush