1#! /usr/bin/env perl
2# Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10#========================================================================
11# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
12# derived from https://github.com/ARM-software/AArch64cryptolib, original
13# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
15# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
16#========================================================================
17#
18# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
19#
20# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
21#
22#  ____________________________________________________
23# |                                                    |
24# | PRE                                                |
25# |____________________________________________________|
26# |                |                |                  |
27# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
28# |________________|________________|__________________|
29# |                |                |                  |
30# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
31# |________________|________________|__________________|
32# |                |                |                  |
33# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
34# |________________|________________|__________________|
35# |                |                |                  |
36# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
37# |________________|____(mostly)____|__________________|
38# |                                                    |
39# | MODULO                                             |
40# |____________________________________________________|
41#
42# PRE:
#     Ensure the previously generated intermediate hash is aligned and merged with the result for GHASH 4k+0
44# EXT low_acc, low_acc, low_acc, #8
45# EOR res_curr (4k+0), res_curr (4k+0), low_acc
46#
47# CTR block:
48#     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
49# REV     ctr32, rev_ctr32
50# ORR     ctr64, constctr96_top32, ctr32, LSL #32
51# INS     ctr_next.d[0], constctr96_bottom64      // Keeping this in scalar registers to free up space in SIMD RF
52# INS     ctr_next.d[1], ctr64X
53# ADD     rev_ctr32, #1
54#
55# AES block:
#     Do AES encryption/decryption on CTR block X and EOR it with input block X. The example below uses a 256-bit key
#     (round keys k0..k14).
#     Small trick: load the input into scalar registers, EOR it with the last round key there, and only then transfer
#     it to a SIMD register. Given how constrained we are in ASIMD registers, this is quite important.
59#
60#     Encrypt:
61# LDR     input_low, [ input_ptr  ], #8
62# LDR     input_high, [ input_ptr  ], #8
63# EOR     input_low, k14_low
64# EOR     input_high, k14_high
65# INS     res_curr.d[0], input_low
66# INS     res_curr.d[1], input_high
67# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
68# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
69# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
70# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
71# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
72# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
73# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
74# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
75# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
76# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
77# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
78# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
79# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
80# AESE    ctr_curr, k13
81# EOR     res_curr, res_curr, ctr_curr
82# ST1     { res_curr.16b  }, [ output_ptr  ], #16
83#
84#     Decrypt:
85# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
86# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
87# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
88# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
89# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
90# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
91# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
92# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
93# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
94# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
95# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
96# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
97# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
98# AESE    ctr_curr, k13
99# LDR     res_curr, [ input_ptr  ], #16
100# EOR     res_curr, res_curr, ctr_curr
101# MOV     output_low, res_curr.d[0]
102# MOV     output_high, res_curr.d[1]
103# EOR     output_low, k14_low
104# EOR     output_high, k14_high
105# STP     output_low, output_high, [ output_ptr  ], #16
106#
107# GHASH block X:
#     Do a 128b Karatsuba polynomial multiplication on the block.
#     We only have 64b->128b polynomial multipliers; naively that means we need to do 4 64b multiplies to generate a
#     128b multiplication:
#
#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
113#
114#     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
115#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
116#
117#     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
118#     multiplying with "twisted" powers of H
119#
120# Note: We can PMULL directly into the acc_x in first GHASH of the loop
121# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
122#       path latency dominates the performance
123#
124#       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
125#       than indicated here
126# REV64   res_curr, res_curr
127# INS     t_m.d[0], res_curr.d[1]
128# EOR     t_m.8B, t_m.8B, res_curr.8B
129# PMULL2  t_h, res_curr, HX
130# PMULL   t_l, res_curr, HX
131# PMULL   t_m, t_m, HX_k
132# EOR     acc_h, acc_h, t_h
133# EOR     acc_l, acc_l, t_l
134# EOR     acc_m, acc_m, t_m
135#
136# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
137#         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
138#         with a reversed constant
139# EOR     acc_m, acc_m, acc_h
140# EOR     acc_m, acc_m, acc_l                     // Finish off karatsuba processing
141# PMULL   t_mod, acc_h, mod_constant
142# EXT     acc_h, acc_h, acc_h, #8
143# EOR     acc_m, acc_m, acc_h
144# EOR     acc_m, acc_m, t_mod
145# PMULL   acc_h, acc_m, mod_constant
146# EXT     acc_m, acc_m, acc_m, #8
147# EOR     acc_l, acc_l, acc_h
148# EOR     acc_l, acc_l, acc_m
149
# Command-line handling: the last argument is taken as the output file when it
# ends in ".<ext>"; after that, the first argument is taken as the flavour when
# it contains no dot. Either may be left undefined.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Directory part of this script's path (including the trailing separator), or
# the empty string when the script was invoked without any directory component.
# Taking $1 only after a successful match avoids picking up a stale capture.
$dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";

# Look for the translator next to this script first, then in ../../perlasm/.
( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator,
# run with the same perl binary that is running this script. The output path is
# quoted so that a path containing spaces reaches the translator as a single
# argument, and a failure to start the pipe is fatal instead of silently
# producing no output.
open OUT, "| \"$^X\" $xlate $flavour " . (defined $output ? "\"$output\"" : "")
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
160
# Names for the registers that hold the kernel's inputs. x0-x3 are used where
# they arrive; the function prologue copies x4 into x16 and x5 into x8, and the
# generated code refers to those copies as $counter and $cc.
($input_ptr, $bit_length, $output_ptr, $current_tag) = map { "x$_" } 0 .. 3;  #argument block
($counter, $cc) = ("x16", "x8");
167
168{
# ---- General-purpose registers -------------------------------------------
# Input-pointer bounds, then the scalar copies of the data blocks. The
# $output_* names are assigned the same registers as the matching $input_*
# names (x6-x7 for block 0, x19-x24 for blocks 1-3).
my ($end_input_ptr, $main_end_input_ptr) = ("x4", "x5");
my ($input_l0, $input_h0) = ("x6", "x7");
my ($output_l0, $output_h0) = ("x6", "x7");
my ($input_l1, $input_h1, $input_l2, $input_h2, $input_l3, $input_h3) = map { "x$_" } 19 .. 24;
my ($output_l1, $output_h1, $output_l2, $output_h2, $output_l3, $output_h3) = map { "x$_" } 19 .. 24;

# Counter bookkeeping, the last round key split into two 64-bit halves, and
# the byte length. Names ending in "w" are the 32-bit views of the
# corresponding "x" registers.
my ($ctr32x, $ctr32w) = ("x9", "w9");
my $ctr96_b64x = "x10";
my ($ctr96_t32x, $ctr96_t32w) = ("x11", "w11");
my ($rctr32x, $rctr32w) = ("x12", "w12");
my ($rk10_l, $rk10_h) = ("x13", "x14");
my $len = "x15";

# ---- SIMD registers --------------------------------------------------------
# v0-v3: counter blocks, v4-v7: result blocks; each set is available as a plain
# vector name, as .16b and as the d-register view (q-register view for results).
my ($ctr0, $ctr1, $ctr2, $ctr3, $res0, $res1, $res2, $res3) = map { "v$_" } 0 .. 7;
my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $res0b, $res1b, $res2b, $res3b) = map { "v$_.16b" } 0 .. 7;
my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $res0d, $res1d, $res2d, $res3d) = map { "d$_" } 0 .. 7;
my ($res0q, $res1q, $res2q, $res3q) = map { "q$_" } 4 .. 7;

# v4-v7 again under the $ctr_t* names (same registers as $res0..$res3).
my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3) = map { "v$_" } 4 .. 7;
my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b) = map { "v$_.16b" } 4 .. 7;
my ($ctr_t0d, $ctr_t1d, $ctr_t2d, $ctr_t3d) = map { "d$_" } 4 .. 7;

# v9-v11: GHASH accumulators (high, mid, low).
my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 9 .. 11;
my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 9 .. 11;
my ($acc_hd, $acc_md, $acc_ld) = map { "d$_" } 9 .. 11;

# v12-v15: h1..h4, v16-v17: h12k and h34k.
my ($h1, $h2, $h3, $h4, $h12k, $h34k) = map { "v$_" } 12 .. 17;
my ($h1b, $h2b, $h3b, $h4b) = map { "v$_.16b" } 12 .. 15;
my ($h1q, $h2q, $h3q, $h4q) = map { "q$_" } 12 .. 15;

# v18-v27: round keys rk0..rk9 in the views the instructions need, plus a few
# extra spellings of v20, v21 and v22.
my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7, $rk8, $rk9) = map { "v$_.16b" } 18 .. 27;
my ($rk0s, $rk1s, $rk2s, $rk3s, $rk4s, $rk5s, $rk6s, $rk7s, $rk8s, $rk9s) = map { "v$_.4s" } 18 .. 27;
my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q, $rk8q, $rk9q) = map { "q$_" } 18 .. 27;
my ($rk2q1, $rk3q1) = ("v20.1q", "v21.1q");
my ($rk4v, $rk4d) = ("v22", "d22");

# Temporaries. Several names map onto the same physical register:
#   v8  - $t0, $t4, $mod_constant
#   v28 - $t1, $t5        v29 - $t2, $t8        v30 - $t3, $t9
#   v31 - $t6, $mod_t     v4  - $t7 (also $res0 / $ctr_t0)
my ($t0, $t0d) = ("v8", "d8");
my ($t1, $t2, $t3) = map { "v$_" } 28 .. 30;
my ($t1d, $t2d, $t3d) = map { "d$_" } 28 .. 30;
my ($t4, $t4d) = ("v8", "d8");
my ($t5, $t5d) = ("v28", "d28");
my ($t6, $t6d) = ("v31", "d31");
my ($t7, $t7d) = ("v4", "d4");
my ($t8, $t8d) = ("v29", "d29");
my ($t9, $t9d) = ("v30", "d30");

# Registers used by the modulo step.
my ($mod_constant, $mod_constantd) = ("v8", "d8");
my $mod_t = "v31";
# Start the generated source with the arm_arch.h include and the opening
# __ARM_MAX_ARCH__ guard.
$code = "#include \"arm_arch.h\"\n\n#if __ARM_MAX_ARCH__>=8\n";

if ($flavour =~ /64/) {
    # Flavour names containing "64": select armv8-a+crypto and open .text.
    $code .= ".arch   armv8-a+crypto\n.text\n";
} else {
    # Every other flavour: NEON FPU plus an INST() macro whose byte order
    # differs between Thumb-2 and ARM mode. $_byte is interpolated here.
    $code .= <<___;
.fpu    neon
#ifdef __thumb2__
.syntax        unified
.thumb
# define INST(a,b,c,d) $_byte  c,0xef,a,b
#else
.code  32
# define INST(a,b,c,d) $_byte  a,b,c,0xf2
#endif

.text
___
}
246
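#########################################################################################
# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
#                               size_t len,
#                               unsigned char *out,
#                               u64 *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
# NOTE(review): the argument order above follows the register usage in the code
# below (x3 is read as the current tag pointer, x4 is copied to the counter
# register and x5 to the round-key pointer) - confirm against the C declaration.
# len is treated as a length in bits: the code shifts it right by 3 to obtain
# the byte length.
#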
255$code.=<<___;
256.global aes_gcm_enc_128_kernel
257.type   aes_gcm_enc_128_kernel,%function
258.align  4
259aes_gcm_enc_128_kernel:
260	cbz     x1, .L128_enc_ret
261	stp     x19, x20, [sp, #-112]!
262	mov     x16, x4
263	mov     x8, x5
264	stp     x21, x22, [sp, #16]
265	stp     x23, x24, [sp, #32]
266	stp     d8, d9, [sp, #48]
267	stp     d10, d11, [sp, #64]
268	stp     d12, d13, [sp, #80]
269	stp     d14, d15, [sp, #96]
270
271	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
272#ifdef __AARCH64EB__
273	rev     $ctr96_b64x, $ctr96_b64x
274	rev     $ctr96_t32x, $ctr96_t32x
275#endif
276	ldp     $rk10_l, $rk10_h, [$cc, #160]                     @ load rk10
277#ifdef __AARCH64EB__
278	ror     $rk10_l, $rk10_l, #32
279	ror     $rk10_h, $rk10_h, #32
280#endif
281	ld1     {$acc_lb}, [$current_tag]
282	ext     $acc_lb, $acc_lb, $acc_lb, #8
283	rev64   $acc_lb, $acc_lb
284	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
285	mov     $len, $main_end_input_ptr
286
287	ld1     {$rk0s}, [$cc], #16								  @ load rk0
288	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
289	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
290
291	lsr     $rctr32x, $ctr96_t32x, #32
292	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
293#ifndef __AARCH64EB__
294	ext     $h4b, $h4b, $h4b, #8
295#endif
296	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
297	rev     $rctr32w, $rctr32w                                @ rev_ctr32
298
299	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
300	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
301	ld1     {$rk1s}, [$cc], #16								  @ load rk1
302
303	rev     $ctr32w, $rctr32w                                 @ CTR block 1
304	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
305	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
306
307	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
308	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
309
310	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
311	rev     $ctr32w, $rctr32w                                 @ CTR block 2
312
313	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
314	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
315	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
316
317	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
318	rev     $ctr32w, $rctr32w                                 @ CTR block 3
319
320	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
321	ld1     {$rk2s}, [$cc], #16								  @ load rk2
322
323	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
324	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
325
326	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
327#ifndef __AARCH64EB__
328	ext     $h3b, $h3b, $h3b, #8
329#endif
330	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
331	ld1     {$rk3s}, [$cc], #16								  @ load rk3
332
333	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
334	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
335#ifndef __AARCH64EB__
336	ext     $h1b, $h1b, $h1b, #8
337#endif
338
339	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
340	ld1     {$rk4s}, [$cc], #16								  @ load rk4
341
342	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
343	ld1     {$rk5s}, [$cc], #16								  @ load rk5
344
345	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
346	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
347
348	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
349	ld1     {$rk6s}, [$cc], #16								  @ load rk6
350
351	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
352	ld1     {$rk7s}, [$cc], #16								  @ load rk7
353
354	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
355	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
356
357	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
358	ld1     {$rk8s}, [$cc], #16								  @ load rk8
359
360	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
361	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
362#ifndef __AARCH64EB__
363	ext     $h2b, $h2b, $h2b, #8
364#endif
365
366	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
367
368	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
369	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
370
371	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
372
373	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
374
375	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
376	ld1     {$rk9s}, [$cc], #16								  @ load rk9
377
378	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
379
380	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
381	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
382
383	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
384	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
385
386	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
387	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
388
389	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
390
391	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
392
393	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
394
395	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
396
397	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
398
399	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
400
401	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
402	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
403
404	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
405
406	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
407
408	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
409
410	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
411
412	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
413
414	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
415
416	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
417
418	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
419
420	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
421
422	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
423
424	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
425
426	aese    $ctr2b, $rk9                                      @ AES block 2 - round 9
427
428	aese    $ctr0b, $rk9                                      @ AES block 0 - round 9
429
430	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
431
432	aese    $ctr1b, $rk9                                      @ AES block 1 - round 9
433
434	aese    $ctr3b, $rk9                                      @ AES block 3 - round 9
435	b.ge    .L128_enc_tail                                    @ handle tail
436
437	ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 0 - load plaintext
438#ifdef __AARCH64EB__
439	rev     $input_l0, $input_l0
440	rev     $input_h0, $input_h0
441#endif
442	ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 2 - load plaintext
443#ifdef __AARCH64EB__
444	rev     $input_l2, $input_l2
445	rev     $input_h2, $input_h2
446#endif
447	ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 1 - load plaintext
448#ifdef __AARCH64EB__
449	rev     $input_l1, $input_l1
450	rev     $input_h1, $input_h1
451#endif
452	ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 3 - load plaintext
453#ifdef __AARCH64EB__
454	rev     $input_l3, $input_l3
455	rev     $input_h3, $input_h3
456#endif
457	eor     $input_l0, $input_l0, $rk10_l                     @ AES block 0 - round 10 low
458	eor     $input_h0, $input_h0, $rk10_h                     @ AES block 0 - round 10 high
459
460	eor     $input_l2, $input_l2, $rk10_l                     @ AES block 2 - round 10 low
461	fmov    $ctr_t0d, $input_l0                               @ AES block 0 - mov low
462
463	eor     $input_l1, $input_l1, $rk10_l                     @ AES block 1 - round 10 low
464	eor     $input_h2, $input_h2, $rk10_h                     @ AES block 2 - round 10 high
465	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 0 - mov high
466
467	fmov    $ctr_t1d, $input_l1                               @ AES block 1 - mov low
468	eor     $input_h1, $input_h1, $rk10_h                     @ AES block 1 - round 10 high
469
470	eor     $input_l3, $input_l3, $rk10_l                     @ AES block 3 - round 10 low
471	fmov    $ctr_t1.d[1], $input_h1                           @ AES block 1 - mov high
472
473	fmov    $ctr_t2d, $input_l2                               @ AES block 2 - mov low
474	eor     $input_h3, $input_h3, $rk10_h                     @ AES block 3 - round 10 high
475	rev     $ctr32w, $rctr32w                                 @ CTR block 4
476
477	fmov    $ctr_t2.d[1], $input_h2                           @ AES block 2 - mov high
478	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
479
480	eor     $res0b, $ctr_t0b, $ctr0b                          @ AES block 0 - result
481	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
482	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
483
484	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
485	rev     $ctr32w, $rctr32w                                 @ CTR block 5
486
487	eor     $res1b, $ctr_t1b, $ctr1b                          @ AES block 1 - result
488	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
489	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
490
491	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
492	add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
493	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
494
495	fmov    $ctr_t3d, $input_l3                               @ AES block 3 - mov low
496	rev     $ctr32w, $rctr32w                                 @ CTR block 6
497	st1     { $res0b}, [$output_ptr], #16                     @ AES block 0 - store result
498
499	fmov    $ctr_t3.d[1], $input_h3                           @ AES block 3 - mov high
500	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
501
502	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
503	eor     $res2b, $ctr_t2b, $ctr2b                          @ AES block 2 - result
504	st1     { $res1b}, [$output_ptr], #16                     @ AES block 1 - store result
505
506	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 6
507	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
508
509	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 6
510	rev     $ctr32w, $rctr32w                                 @ CTR block 7
511	st1     { $res2b}, [$output_ptr], #16                     @ AES block 2 - store result
512
513	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 7
514
515	eor     $res3b, $ctr_t3b, $ctr3b                          @ AES block 3 - result
516	st1     { $res3b}, [$output_ptr], #16                     @ AES block 3 - store result
517	b.ge    .L128_enc_prepretail                              @ do prepretail
518
519	.L128_enc_main_loop:                                      @ main loop start
520	ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 4k+3 - load plaintext
521#ifdef __AARCH64EB__
522	rev     $input_l3, $input_l3
523	rev     $input_h3, $input_h3
524#endif
525	rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
526	rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
527
528	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
529	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+3
530
531	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
532	rev64   $res1b, $res1b                                    @ GHASH block 4k+1 (t0 and t1 free)
533
534	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
535	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+3
536	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+3
537
538	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
539	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
540
541	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
542	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
543
544	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
545	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
546
547	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
548	eor     $input_h3, $input_h3, $rk10_h                     @ AES block 4k+3 - round 10 high
549
550	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
551	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
552	ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 4k+4 - load plaintext
553#ifdef __AARCH64EB__
554	rev     $input_l0, $input_l0
555	rev     $input_h0, $input_h0
556#endif
557	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
558	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
559
560	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
561	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
562	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
563
564	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
565	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
566	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
567
568	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
569
570	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
571	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
572
573	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
574
575	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
576	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
577
578	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
579
580	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
581	rev64   $res3b, $res3b                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
582
583	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
584
585	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
586	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
587
588	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
589	eor     $input_h0, $input_h0, $rk10_h                     @ AES block 4k+4 - round 10 high
590
591	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
592	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
593
594	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
595	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
596
597	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
598	eor     $input_l0, $input_l0, $rk10_l                     @ AES block 4k+4 - round 10 low
599
600	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
601	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
602
603	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
604
605	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
606	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
607
608	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
609
610	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
611	movi    $mod_constant.8b, #0xc2
612
613	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
614	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
615
616	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
617
618	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
619	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
620
621	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
622	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
623
624	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
625	ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 4k+5 - load plaintext
626#ifdef __AARCH64EB__
627	rev     $input_l1, $input_l1
628	rev     $input_h1, $input_h1
629#endif
630	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
631	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
632
633	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
634	ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 4k+6 - load plaintext
635#ifdef __AARCH64EB__
636	rev     $input_l2, $input_l2
637	rev     $input_h2, $input_h2
638#endif
639	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
640	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
641
642	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
643	eor     $input_l1, $input_l1, $rk10_l                     @ AES block 4k+5 - round 10 low
644
645	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
646	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
647
648	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
649	eor     $input_l3, $input_l3, $rk10_l                     @ AES block 4k+3 - round 10 low
650
651	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
652	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
653
654	fmov    $ctr_t0d, $input_l0                               @ AES block 4k+4 - mov low
655	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
656	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 4k+4 - mov high
657
658	add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
659	fmov    $ctr_t3d, $input_l3                               @ AES block 4k+3 - mov low
660	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
661
662	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
663	fmov    $ctr_t1d, $input_l1                               @ AES block 4k+5 - mov low
664
665	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
666	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
667
668	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
669	eor     $input_h1, $input_h1, $rk10_h                     @ AES block 4k+5 - round 10 high
670
671	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
672	fmov    $ctr_t1.d[1], $input_h1                           @ AES block 4k+5 - mov high
673
674	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
675	fmov    $ctr_t3.d[1], $input_h3                           @ AES block 4k+3 - mov high
676
677	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
678	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
679
680	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
681	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
682
683	aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
684	eor     $input_l2, $input_l2, $rk10_l                     @ AES block 4k+6 - round 10 low
685	eor     $input_h2, $input_h2, $rk10_h                     @ AES block 4k+6 - round 10 high
686
687	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
688	fmov    $ctr_t2d, $input_l2                               @ AES block 4k+6 - mov low
689
690	aese    $ctr1b, $rk9                                      @ AES block 4k+5 - round 9
691	fmov    $ctr_t2.d[1], $input_h2                           @ AES block 4k+6 - mov high
692
693	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
694	eor     $res0b, $ctr_t0b, $ctr0b                          @ AES block 4k+4 - result
695
696	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
697	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
698
699	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
700	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
701	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
702
703	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
704	eor     $res1b, $ctr_t1b, $ctr1b                          @ AES block 4k+5 - result
705
706	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
707	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
708	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
709
710	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d            @ MODULO - mid 64b align with low
711	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
712	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
713
714	aese    $ctr2b, $rk9                                      @ AES block 4k+6 - round 9
715	st1     { $res0b}, [$output_ptr], #16                     @ AES block 4k+4 - store result
716	eor     $res2b, $ctr_t2b, $ctr2b                          @ AES block 4k+6 - result
717	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
718
719	aese    $ctr3b, $rk9                                      @ AES block 4k+7 - round 9
720	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
721	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
722	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+10
723
724	eor     $acc_lb, $acc_lb, $acc_hb                         @ MODULO - fold into low
725	st1     { $res1b}, [$output_ptr], #16                     @ AES block 4k+5 - store result
726
727	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+10
728	st1     { $res2b}, [$output_ptr], #16                     @ AES block 4k+6 - store result
729	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+11
730
731	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+11
732	eor     $res3b, $ctr_t3b, $ctr3b                          @ AES block 4k+3 - result
733
734	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
735	st1     { $res3b}, [$output_ptr], #16                     @ AES block 4k+3 - store result
736	b.lt    .L128_enc_main_loop
737
738	.L128_enc_prepretail:                                     @ PREPRETAIL - GHASH blocks 4k..4k+3 held in res0-res3, reduce, and run AES rounds 0-9 on counter blocks 4k+4..4k+7 (no loads or stores in this section)
739	rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
740	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+3
741	rev64   $res1b, $res1b                                    @ GHASH block 4k+1 (t0 and t1 free)
742
743	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
744	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+3
745	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+3
746
747	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
748	rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
749
750	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
751
752	rev64   $res3b, $res3b                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
753	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
754
755	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
756
757	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
758	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
759
760	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
761	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
762
763	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
764	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
765
766	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
767	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
768
769	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
770
771	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
772	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
773
774	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
775
776	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
777	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
778
779	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
780
781	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
782	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
783
784	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
785
786	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
787	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
788
789	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
790	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
791
792	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
793
794	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
795	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
796
797	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
798
799	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
800
801	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
802	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
803
804	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
805
806	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
807	movi    $mod_constant.8b, #0xc2                           @ mod_constant - every byte set to 0xc2 (shifted left by 56 below)
808
809	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
810	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
811
812	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
813
814	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
815	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
816
817	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
818
819	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
820	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
821
822	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
823
824	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
825	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
826
827	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
828	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
829
830	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
831
832	pmull   $t1.1q, $acc_h.1d, $mod_constant.1d               @ MODULO - top 64b align with mid
833	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - karatsuba tidy up
834
835	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
836
837	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
838	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
839
840	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
841
842	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
843	eor     $acc_mb, $acc_mb, $acc_lb                         @ MODULO - karatsuba tidy up
844
845	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
846
847	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
848
849	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
850
851	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
852	eor     $acc_mb, $acc_mb, $t1.16b                         @ MODULO - fold into mid
853
854	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
855
856	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
857
858	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
859
860	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
861	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
862
863	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
864
865	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
866
867	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
868
869	pmull   $t1.1q, $acc_m.1d, $mod_constant.1d               @ MODULO - mid 64b align with low
870
871	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
872	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
873
874	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
875
876	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
877	eor     $acc_lb, $acc_lb, $t1.16b                         @ MODULO - fold into low
878
879	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
880
881	aese    $ctr3b, $rk9                                      @ AES block 4k+7 - round 9
882
883	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
884
885	aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
886
887	aese    $ctr1b, $rk9                                      @ AES block 4k+5 - round 9
888	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
889
890	aese    $ctr2b, $rk9                                      @ AES block 4k+6 - round 9
891	.L128_enc_tail:                                           @ TAIL - encrypt the first remaining block, then dispatch on the number of bytes left (up to 4 blocks are handled below)
892
893	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
894	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES block 4k+4 - load plaintext
895#ifdef __AARCH64EB__
896	rev     $input_l0, $input_l0
897	rev     $input_h0, $input_h0
898#endif
899	cmp     $main_end_input_ptr, #48                          @ more than 3 blocks (48 bytes) left?
900
901	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
902	eor     $input_l0, $input_l0, $rk10_l                     @ AES block 4k+4 - round 10 low
903	eor     $input_h0, $input_h0, $rk10_h                     @ AES block 4k+4 - round 10 high
904
905	fmov    $ctr_t0d, $input_l0                               @ AES block 4k+4 - mov low
906
907	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 4k+4 - mov high
908
909	eor     $res1b, $ctr_t0b, $ctr0b                          @ AES block 4k+4 - result
910
911	b.gt    .L128_enc_blocks_more_than_3                      @ yes - all four prepared keystream blocks are used
912
913	sub     $rctr32w, $rctr32w, #1                            @ at most 3 blocks left - step rev_ctr32 back by one (presumably for the unused counter block - confirm)
914	movi    $acc_l.8b, #0                                     @ clear GHASH low accumulator (the running tag is carried in t0)
915	mov     $ctr3b, $ctr2b                                    @ shift keystream: ctr3 takes ctr2 (the final block is XORed with ctr3)
916
917	cmp     $main_end_input_ptr, #32                          @ more than 2 blocks (32 bytes) left?
918	mov     $ctr2b, $ctr1b                                    @ shift keystream: ctr2 takes ctr1 (the final-1 block is XORed with ctr2)
919	movi    $acc_h.8b, #0                                     @ clear GHASH high accumulator
920
921	movi    $acc_m.8b, #0                                     @ clear GHASH mid accumulator
922	b.gt    .L128_enc_blocks_more_than_2
923
924	mov     $ctr3b, $ctr1b                                    @ at most 2 blocks left: ctr3 takes ctr1 (the final block is XORed with ctr3)
925	cmp     $main_end_input_ptr, #16                          @ more than 1 block (16 bytes) left?
926
927	sub     $rctr32w, $rctr32w, #1                            @ step rev_ctr32 back by one more
928	b.gt    .L128_enc_blocks_more_than_1
929
930	sub     $rctr32w, $rctr32w, #1                            @ step rev_ctr32 back by one more
931	b       .L128_enc_blocks_less_than_1                      @ a single, possibly partial, block is left
932	.L128_enc_blocks_more_than_3:                             @ blocks left >  3 - store final-3 ciphertext, start GHASH with it (h4 multiplier), encrypt final-2
933	st1     { $res1b}, [$output_ptr], #16                     @ AES final-3 block  - store result
934
935	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final-2 block - load input low & high
936#ifdef __AARCH64EB__
937	rev     $input_l0, $input_l0
938	rev     $input_h0, $input_h0
939#endif
940	rev64   $res0b, $res1b                                    @ GHASH final-3 block
941
942	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
943	eor     $input_h0, $input_h0, $rk10_h                     @ AES final-2 block - round 10 high
944	eor     $input_l0, $input_l0, $rk10_l                     @ AES final-2 block - round 10 low
945
946	fmov    $res1d, $input_l0                                 @ AES final-2 block - mov low
947
948	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
949	fmov    $res1.d[1], $input_h0                             @ AES final-2 block - mov high
950
951	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH final-3 block - low
952	mov     $rk4d, $res0.d[1]                                 @ GHASH final-3 block - mid (the rk4 register is used as scratch from here on)
953
954	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH final-3 block - high
955
956	mov     $acc_md, $h34k.d[1]                               @ GHASH final-3 block - mid
957
958	eor     $res1b, $res1b, $ctr1b                            @ AES final-2 block - result
959	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-3 block - mid
960
961	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                    @ GHASH final-3 block - mid
962	.L128_enc_blocks_more_than_2:                             @ blocks left >  2 - store final-2 ciphertext, accumulate its GHASH terms (h3 multiplier), encrypt final-1
963
964	st1     { $res1b}, [$output_ptr], #16                     @ AES final-2 block - store result
965
966	rev64   $res0b, $res1b                                    @ GHASH final-2 block
967	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final-1 block - load input low & high
968#ifdef __AARCH64EB__
969	rev     $input_l0, $input_l0
970	rev     $input_h0, $input_h0
971#endif
972	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
973
974	eor     $input_l0, $input_l0, $rk10_l                     @ AES final-1 block - round 10 low
975
976	fmov    $res1d, $input_l0                                 @ AES final-1 block - mov low
977	eor     $input_h0, $input_h0, $rk10_h                     @ AES final-1 block - round 10 high
978
979	pmull2  $rk2q1, $res0.2d, $h3.2d                          @ GHASH final-2 block - high (the rk2 register is used as scratch)
980	fmov    $res1.d[1], $input_h0                             @ AES final-1 block - mov high
981
982	mov     $rk4d, $res0.d[1]                                 @ GHASH final-2 block - mid (the rk4 register is used as scratch)
983
984	pmull   $rk3q1, $res0.1d, $h3.1d                          @ GHASH final-2 block - low (the rk3 register is used as scratch)
985
986	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-2 block - high
987
988	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-2 block - mid
989
990	eor     $res1b, $res1b, $ctr2b                            @ AES final-1 block - result
991
992	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-2 block - low
993
994	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                      @ GHASH final-2 block - mid
995
996	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
997
998	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-2 block - mid
999	.L128_enc_blocks_more_than_1:                             @ blocks left >  1 - store final-1 ciphertext, accumulate its GHASH terms (h2 multiplier), encrypt the final block
1000
1001	st1     { $res1b}, [$output_ptr], #16                     @ AES final-1 block - store result
1002
1003	rev64   $res0b, $res1b                                    @ GHASH final-1 block
1004	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final block - load input low & high
1005#ifdef __AARCH64EB__
1006	rev     $input_l0, $input_l0
1007	rev     $input_h0, $input_h0
1008#endif
1009	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1010
1011	eor     $input_h0, $input_h0, $rk10_h                     @ AES final block - round 10 high
1012	eor     $input_l0, $input_l0, $rk10_l                     @ AES final block - round 10 low
1013
1014	fmov    $res1d, $input_l0                                 @ AES final block - mov low
1015
1016	pmull2  $rk2q1, $res0.2d, $h2.2d                          @ GHASH final-1 block - high (the rk2 register is used as scratch)
1017	fmov    $res1.d[1], $input_h0                             @ AES final block - mov high
1018
1019	mov     $rk4d, $res0.d[1]                                 @ GHASH final-1 block - mid (the rk4 register is used as scratch)
1020
1021	pmull   $rk3q1, $res0.1d, $h2.1d                          @ GHASH final-1 block - low (the rk3 register is used as scratch)
1022
1023	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-1 block - mid
1024
1025	eor     $res1b, $res1b, $ctr3b                            @ AES final block - result
1026
1027	ins     $rk4v.d[1], $rk4v.d[0]                            @ GHASH final-1 block - mid (copy to the upper half for the pmull2 below)
1028
1029	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                      @ GHASH final-1 block - mid
1030
1031	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-1 block - low
1032
1033	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-1 block - high
1034
1035	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-1 block - mid
1036	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
1037	.L128_enc_blocks_less_than_1:                             @ blocks left <= 1 - mask the possibly partial last block, GHASH it, reduce, store result, counter and tag, then return
1038
1039	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
1040	mvn     $rk10_l, xzr                                      @ rk10_l = 0xffffffffffffffff
1041
1042	mvn     $rk10_h, xzr                                      @ rk10_h = 0xffffffffffffffff
1043	sub     $bit_length, $bit_length, #128                    @ bit_length -= 128
1044
1045	neg     $bit_length, $bit_length                          @ bit_length = 128 - #bits in input (in range [1,128])
1046
1047	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
1048
1049	lsr     $rk10_h, $rk10_h, $bit_length                     @ rk10_h is mask for top 64b of last block
1050	cmp     $bit_length, #64                                  @ are fewer than 64 bits being masked off?
1051
1052	csel    $input_l0, $rk10_l, $rk10_h, lt                   @ low 64b of mask: all ones if lt, otherwise the shifted rk10_h
1053	csel    $input_h0, $rk10_h, xzr, lt                       @ high 64b of mask: the shifted rk10_h if lt, otherwise zero
1054
1055	fmov    $ctr0d, $input_l0                                 @ ctr0b is mask for last block
1056
1057	fmov    $ctr0.d[1], $input_h0                             @ high half of the last-block mask
1058
1059	and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
1060
1061	rev64   $res0b, $res1b                                    @ GHASH final block
1062
1063	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1064
1065	mov     $t0d, $res0.d[1]                                  @ GHASH final block - mid
1066
1067	pmull   $rk3q1, $res0.1d, $h1.1d                          @ GHASH final block - low
1068	ld1     { $rk0}, [$output_ptr]                            @ load existing bytes where the possibly partial last block is to be stored
1069
1070	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
1071#ifndef __AARCH64EB__
1072	rev     $ctr32w, $rctr32w                                 @ little-endian: byte-swap rev_ctr32 into ctr32 for the counter store below
1073#else
1074	mov     $ctr32w, $rctr32w                                 @ big-endian: copy rev_ctr32 unchanged
1075#endif
1076	pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
1077
1078	pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
1079
1080	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final block - low
1081
1082	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final block - high
1083
1084	eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
1085	movi    $mod_constant.8b, #0xc2                           @ mod_constant - every byte set to 0xc2 (shifted left by 56 below)
1086
1087	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
1088
1089	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
1090
1091	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
1092
1093	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
1094
1095	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
1096
1097	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
1098
1099	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
1100
1101	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d            @ MODULO - mid 64b align with low
1102
1103	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
1104
1105	bif     $res1b, $rk0, $ctr0b                              @ insert existing bytes in top end of result before storing
1106
1107	eor     $acc_lb, $acc_lb, $acc_hb                         @ MODULO - fold into low
1108	st1     { $res1b}, [$output_ptr]                          @ store all 16B
1109
1110	str     $ctr32w, [$counter, #12]                          @ store the updated counter
1111
1112	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
1113	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ swap the two 64b halves of the reduced tag
1114	rev64   $acc_lb, $acc_lb                                  @ byte-reverse within each 64b half
1115	mov     x0, $len                                          @ return value (len - presumably the byte count saved at entry, confirm)
1116	st1     { $acc_l.16b }, [$current_tag]                    @ store the updated tag
1117	ldp     x21, x22, [sp, #16]                               @ restore saved general-purpose registers
1118	ldp     x23, x24, [sp, #32]
1119	ldp     d8, d9, [sp, #48]                                 @ restore saved FP/SIMD registers (64-bit views)
1120	ldp     d10, d11, [sp, #64]
1121	ldp     d12, d13, [sp, #80]
1122	ldp     d14, d15, [sp, #96]
1123	ldp     x19, x20, [sp], #112                              @ restore the last pair and release the 112-byte stack frame
1124	ret
1125
1126.L128_enc_ret:
1127	mov w0, #0x0                                              @ return 0; no stack frame is released on this path
1128	ret
1129.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
1130___
1131
1132#########################################################################################
1133# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
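1134#                               size_t len,              /* appears to be a length in bits: the entry code derives byte_len as bit_length >> 3 - confirm against callers */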
1135#                               unsigned char *out,
1136#                               const void *key,
1137#                               unsigned char ivec[16],
1138#                               u64 *Xi);
1139#
1140$code.=<<___;
1141.global aes_gcm_dec_128_kernel
1142.type   aes_gcm_dec_128_kernel,%function
1143.align  4
1144aes_gcm_dec_128_kernel:
1145	cbz     x1, .L128_dec_ret
1146	stp     x19, x20, [sp, #-112]!
1147	mov     x16, x4
1148	mov     x8, x5
1149	stp     x21, x22, [sp, #16]
1150	stp     x23, x24, [sp, #32]
1151	stp     d8, d9, [sp, #48]
1152	stp     d10, d11, [sp, #64]
1153	stp     d12, d13, [sp, #80]
1154	stp     d14, d15, [sp, #96]
1155
1156	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
1157	mov     $len, $main_end_input_ptr
1158	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
1159#ifdef __AARCH64EB__
1160	rev     $ctr96_b64x, $ctr96_b64x
1161	rev     $ctr96_t32x, $ctr96_t32x
1162#endif
1163	ldp     $rk10_l, $rk10_h, [$cc, #160]                     @ load rk10
1164#ifdef __AARCH64EB__
1165	ror     $rk10_h, $rk10_h, 32
1166	ror     $rk10_l, $rk10_l, 32
1167#endif
1168	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
1169	ld1     {$rk0s}, [$cc], #16                                @ load rk0
1170
1171	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1172	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
1173
1174	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
1175#ifndef __AARCH64EB__
1176	ext     $h2b, $h2b, $h2b, #8
1177#endif
1178	lsr     $rctr32x, $ctr96_t32x, #32
1179	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
1180
1181	ld1     {$rk1s}, [$cc], #16                                @ load rk1
1182	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1183	rev     $rctr32w, $rctr32w                                @ rev_ctr32
1184
1185	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
1186	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
1187
1188	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
1189	rev     $ctr32w, $rctr32w                                 @ CTR block 1
1190
1191	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
1192	ld1     {$rk2s}, [$cc], #16                                @ load rk2
1193	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
1194
1195	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
1196	rev     $ctr32w, $rctr32w                                 @ CTR block 2
1197	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
1198
1199	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
1200	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
1201
1202	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
1203	rev     $ctr32w, $rctr32w                                 @ CTR block 3
1204
1205	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
1206	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
1207	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
1208
1209	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
1210	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
1211
1212	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
1213	ld1     {$rk3s}, [$cc], #16                                @ load rk3
1214
1215	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
1216	ld1     {$rk4s}, [$cc], #16                                @ load rk4
1217
1218	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
1219	ld1     {$rk5s}, [$cc], #16                                @ load rk5
1220
1221	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
1222	ld1     {$rk6s}, [$cc], #16                                @ load rk6
1223
1224	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
1225
1226	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
1227
1228	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
1229
1230	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
1231	ld1     { $acc_lb}, [$current_tag]
1232	ext     $acc_lb, $acc_lb, $acc_lb, #8
1233	rev64   $acc_lb, $acc_lb
1234
1235	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
1236	ld1     {$rk7s}, [$cc], #16                                @ load rk7
1237
1238	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
1239
1240	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
1241
1242	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
1243	ld1     {$rk8s}, [$cc], #16                                @ load rk8
1244
1245	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
1246
1247	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
1248
1249	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
1250	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
1251#ifndef __AARCH64EB__
1252	ext     $h3b, $h3b, $h3b, #8
1253#endif
1254	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
1255	ld1     {$rk9s}, [$cc], #16                                @ load rk9
1256
1257	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
1258
1259	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
1260
1261	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
1262
1263	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
1264
1265	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
1266	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
1267#ifndef __AARCH64EB__
1268	ext     $h1b, $h1b, $h1b, #8
1269#endif
1270	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
1271
1272	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
1273
1274	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
1275
1276	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
1277
1278	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
1279	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
1280
1281	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
1282#ifndef __AARCH64EB__
1283	ext     $h4b, $h4b, $h4b, #8
1284#endif
1285	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
1286	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1287
1288	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
1289
1290	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
1291
1292	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
1293	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
1294
1295	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
1296
1297	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
1298	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
1299
1300	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
1301
1302	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
1303
1304	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
1305	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
1306
1307	aese    $ctr2b, $rk9                                      @ AES block 2 - round 9
1308
1309	aese    $ctr3b, $rk9                                      @ AES block 3 - round 9
1310
1311	aese    $ctr0b, $rk9                                      @ AES block 0 - round 9
1312	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
1313
1314	aese    $ctr1b, $rk9                                      @ AES block 1 - round 9
1315	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
1316	b.ge    .L128_dec_tail                                    @ handle tail
1317
1318	ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
1319
1320	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
1321	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 2 - load ciphertext
1322
1323	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 0 - result
1324	rev64   $res0b, $res0b                                    @ GHASH block 0
1325	rev     $ctr32w, $rctr32w                                 @ CTR block 4
1326
1327	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
1328	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
1329	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 3 - load ciphertext
1330
1331	rev64   $res1b, $res1b                                    @ GHASH block 1
1332	mov     $output_l1, $ctr1.d[0]                            @ AES block 1 - mov low
1333
1334	mov     $output_h1, $ctr1.d[1]                            @ AES block 1 - mov high
1335
1336	mov     $output_l0, $ctr0.d[0]                            @ AES block 0 - mov low
1337	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
1338
1339	mov     $output_h0, $ctr0.d[1]                            @ AES block 0 - mov high
1340
1341	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
1342
1343	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
1344	rev     $ctr32w, $rctr32w                                 @ CTR block 5
1345	eor     $output_l1, $output_l1, $rk10_l                   @ AES block 1 - round 10 low
1346#ifdef __AARCH64EB__
1347	rev     $output_l1, $output_l1
1348#endif
1349	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
1350	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
1351	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
1352
1353	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
1354	rev     $ctr32w, $rctr32w                                 @ CTR block 6
1355	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
1356
1357	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
1358
1359	eor     $output_h1, $output_h1, $rk10_h                   @ AES block 1 - round 10 high
1360#ifdef __AARCH64EB__
1361	rev     $output_h1, $output_h1
1362#endif
1363	eor     $output_l0, $output_l0, $rk10_l                   @ AES block 0 - round 10 low
1364#ifdef __AARCH64EB__
1365	rev     $output_l0, $output_l0
1366#endif
1367	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 2 - result
1368
1369	eor     $output_h0, $output_h0, $rk10_h                   @ AES block 0 - round 10 high
1370#ifdef __AARCH64EB__
1371	rev     $output_h0, $output_h0
1372#endif
1373	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
1374
1375	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 1 - store result
1376	b.ge    .L128_dec_prepretail                              @ do prepretail
1377
1378	.L128_dec_main_loop:                                      @ main loop start
1379	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
1380	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
1381	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
1382
1383	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
1384	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
1385
1386	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
1387	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
1388
1389	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
1390	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
1391	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
1392
1393	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
1394	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
1395	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
1396
1397	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
1398	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
1399
1400	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
1401	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
1402	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
1403
1404	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
1405	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
1406	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
1407
1408	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
1409	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
1410
1411	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
1412	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
1413
1414	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
1415	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
1416
1417	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
1418
1419	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
1420	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
1421
1422	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
1423	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
1424
1425	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
1426
1427	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
1428	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
1429
1430	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
1431	eor     $output_l3, $output_l3, $rk10_l                   @ AES block 4k+3 - round 10 low
1432#ifdef __AARCH64EB__
1433	rev     $output_l3, $output_l3
1434#endif
1435	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
1436	eor     $output_h2, $output_h2, $rk10_h                   @ AES block 4k+2 - round 10 high
1437#ifdef __AARCH64EB__
1438	rev     $output_h2, $output_h2
1439#endif
1440	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
1441
1442	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
1443	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
1444
1445	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
1446
1447	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
1448	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
1449
1450	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
1451
1452	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
1453	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
1454
1455	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
1456
1457	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
1458	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
1459
1460	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
1461
1462	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
1463	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
1464
1465	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
1466	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
1467
1468	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
1469	eor     $output_h3, $output_h3, $rk10_h                   @ AES block 4k+3 - round 10 high
1470#ifdef __AARCH64EB__
1471	rev     $output_h3, $output_h3
1472#endif
1473	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
1474	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
1475
1476	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
1477	eor     $output_l2, $output_l2, $rk10_l                   @ AES block 4k+2 - round 10 low
1478#ifdef __AARCH64EB__
1479	rev     $output_l2, $output_l2
1480#endif
1481	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
1482	movi    $mod_constant.8b, #0xc2
1483
1484	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
1485	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
1486
1487	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
1488
1489	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
1490	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
1491
1492	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
1493	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
1494
1495	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
1496	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
1497	ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+3 - load ciphertext
1498
1499	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
1500	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
1501
1502	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
1503	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
1504
1505	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
1506	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
1507
1508	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
1509	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
1510
1511	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
1512	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
1513
1514	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
1515	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
1516
1517	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
1518	ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
1519	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
1520
1521	aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
1522	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
1523
1524	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
1525	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
1526
1527	aese    $ctr1b, $rk9                                      @ AES block 4k+5 - round 9
1528
1529	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
1530	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
1531
1532	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
1533	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
1534
1535	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
1536	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
1537	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 4k+5 - result
1538
1539	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
1540	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
1541
1542	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
1543
1544	rev64   $res1b, $res1b                                    @ GHASH block 4k+5
1545	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
1546	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
1547
1548	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
1549	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
1550
1551	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
1552	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
1553
1554	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
1555	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
1556	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
1557
1558	aese    $ctr2b, $rk9                                      @ AES block 4k+6 - round 9
1559	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
1560	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
1561
1562	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
1563	eor     $output_h0, $output_h0, $rk10_h                   @ AES block 4k+4 - round 10 high
1564#ifdef __AARCH64EB__
1565	rev     $output_h0, $output_h0
1566#endif
1567	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
1568	mov     $output_h1, $ctr1.d[1]                            @ AES block 4k+5 - mov high
1569	eor     $output_l0, $output_l0, $rk10_l                   @ AES block 4k+4 - round 10 low
1570#ifdef __AARCH64EB__
1571	rev     $output_l0, $output_l0
1572#endif
1573	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 4k+6 - result
1574	mov     $output_l1, $ctr1.d[0]                            @ AES block 4k+5 - mov low
1575	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
1576
1577	aese    $ctr3b, $rk9                                      @ AES block 4k+7 - round 9
1578	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
1579	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
1580
1581	rev64   $res0b, $res0b                                    @ GHASH block 4k+4
1582	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
1583	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
1584
1585	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
1586	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
1587
1588	eor     $output_h1, $output_h1, $rk10_h                   @ AES block 4k+5 - round 10 high
1589#ifdef __AARCH64EB__
1590	rev     $output_h1, $output_h1
1591#endif
1592	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
1593
1594	eor     $output_l1, $output_l1, $rk10_l                   @ AES block 4k+5 - round 10 low
1595#ifdef __AARCH64EB__
1596	rev     $output_l1, $output_l1
1597#endif
1598	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 4k+5 - store result
1599
1600	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
1601	b.lt    .L128_dec_main_loop                               @ LOOP BACK-EDGE - iterate while input_ptr < main_end_input_ptr
1602
1603	.L128_dec_prepretail:                                     @ PREPRETAIL
1604	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
1605	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
1606	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
1607
1608	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
1609	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
1610
1611	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
1612	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
1613
1614	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
1615	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
1616	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
1617
1618	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
1619	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
1620
1621	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
1622	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
1623	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
1624
1625	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
1626	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
1627	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
1628
1629	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
1630	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
1631
1632	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
1633	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
1634
1635	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
1636	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
1637	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
1638
1639	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
1640	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
1641
1642	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
1643	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
1644
1645	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
1646
1647	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
1648	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
1649
1650	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
1651
1652	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
1653	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
1654
1655	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
1656
1657	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
1658	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
1659
1660	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
1661
1662	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
1663	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
1664
1665	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
1666
1667	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
1668
1669	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
1670	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
1671
1672	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
1673	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
1674
1675	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
1676
1677	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
1678	movi    $mod_constant.8b, #0xc2
1679
1680	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
1681	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
1682
1683	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
1684
1685	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
1686	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
1687
1688	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
1689	eor     $output_l3, $output_l3, $rk10_l                   @ AES block 4k+3 - round 10 low
1690#ifdef __AARCH64EB__
1691	rev     $output_l3, $output_l3
1692#endif
1693	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
1694	eor     $output_l2, $output_l2, $rk10_l                   @ AES block 4k+2 - round 10 low
1695#ifdef __AARCH64EB__
1696	rev     $output_l2, $output_l2
1697#endif
1698	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
1699
1700	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
1701
1702	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
1703	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
1704
1705	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
1706
1707	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
1708	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
1709
1710	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
1711
1712	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
1713	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
1714
1715	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
1716
1717	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
1718
1719	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
1720
1721	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
1722	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
1723
1724	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
1725
1726	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
1727	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
1728
1729	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
1730
1731	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
1732	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
1733
1734	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
1735
1736	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
1737
1738	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
1739
1740	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
1741	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
1742
1743	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
1744
1745	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
1746
1747	aese    $ctr1b, $rk9                                      @ AES block 4k+5 - round 9
1748
1749	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
1750	eor     $output_h3, $output_h3, $rk10_h                   @ AES block 4k+3 - round 10 high
1751#ifdef __AARCH64EB__
1752	rev     $output_h3, $output_h3
1753#endif
1754	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
1755	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
1756
1757	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
1758
1759	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
1760	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
1761
1762	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
1763
1764	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
1765	eor     $output_h2, $output_h2, $rk10_h                   @ AES block 4k+2 - round 10 high
1766#ifdef __AARCH64EB__
1767	rev     $output_h2, $output_h2
1768#endif
1769	aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
1770	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
1771
1772	aese    $ctr2b, $rk9                                      @ AES block 4k+6 - round 9
1773	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
1774	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
1775
1776	aese    $ctr3b, $rk9                                      @ AES block 4k+7 - round 9
1777	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
1778	.L128_dec_tail:                                           @ TAIL
1779
1780	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
1781	ld1     { $res1b}, [$input_ptr], #16                      @ AES block 4k+4 - load ciphertext
1782
1783	eor     $ctr0b, $res1b, $ctr0b                            @ AES block 4k+4 - result
1784
1785	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
1786
1787	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
1788
1789	cmp     $main_end_input_ptr, #48
1790
1791	eor     $output_h0, $output_h0, $rk10_h                   @ AES block 4k+4 - round 10 high
1792#ifdef __AARCH64EB__
1793	rev     $output_h0, $output_h0
1794#endif
1795	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
1796	eor     $output_l0, $output_l0, $rk10_l                   @ AES block 4k+4 - round 10 low
1797#ifdef __AARCH64EB__
1798	rev     $output_l0, $output_l0
1799#endif
1800	b.gt    .L128_dec_blocks_more_than_3
1801
1802	mov     $ctr3b, $ctr2b
1803	sub     $rctr32w, $rctr32w, #1
1804	movi    $acc_l.8b, #0
1805
1806	movi    $acc_h.8b, #0
1807	mov     $ctr2b, $ctr1b
1808
1809	movi    $acc_m.8b, #0
1810	cmp     $main_end_input_ptr, #32
1811	b.gt     .L128_dec_blocks_more_than_2
1812
1813	cmp     $main_end_input_ptr, #16
1814
1815	mov     $ctr3b, $ctr1b
1816	sub     $rctr32w, $rctr32w, #1
1817	b.gt    .L128_dec_blocks_more_than_1
1818
1819	sub     $rctr32w, $rctr32w, #1
1820	b       .L128_dec_blocks_less_than_1
1821	.L128_dec_blocks_more_than_3:                             @ blocks left >  3
1822	rev64   $res0b, $res1b                                    @ GHASH final-3 block
1823	ld1     { $res1b}, [$input_ptr], #16                      @ AES final-2 block - load ciphertext
1824
1825	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1826
1827	mov     $acc_md, $h34k.d[1]                               @ GHASH final-3 block - mid
1828	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-3 block  - store result
1829	eor     $ctr0b, $res1b, $ctr1b                            @ AES final-2 block - result
1830
1831	mov     $rk4d, $res0.d[1]                                 @ GHASH final-3 block - mid
1832	mov     $output_h0, $ctr0.d[1]                            @ AES final-2 block - mov high
1833
1834	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH final-3 block - low
1835	mov     $output_l0, $ctr0.d[0]                            @ AES final-2 block - mov low
1836
1837	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH final-3 block - high
1838
1839	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-3 block - mid
1840
1841	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
1842	eor     $output_h0, $output_h0, $rk10_h                   @ AES final-2 block - round 10 high
1843#ifdef __AARCH64EB__
1844	rev     $output_h0, $output_h0
1845#endif
1846	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                    @ GHASH final-3 block - mid
1847	eor     $output_l0, $output_l0, $rk10_l                   @ AES final-2 block - round 10 low
1848#ifdef __AARCH64EB__
1849	rev     $output_l0, $output_l0
1850#endif
1851	.L128_dec_blocks_more_than_2:                             @ blocks left >  2
1852
1853	rev64   $res0b, $res1b                                    @ GHASH final-2 block
1854	ld1     { $res1b}, [$input_ptr], #16                      @ AES final-1 block - load ciphertext
1855
1856	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1857
1858	eor     $ctr0b, $res1b, $ctr2b                            @ AES final-1 block - result
1859	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-2 block  - store result
1860
1861	mov     $rk4d, $res0.d[1]                                 @ GHASH final-2 block - mid
1862
1863	pmull   $rk3q1, $res0.1d, $h3.1d                          @ GHASH final-2 block - low
1864
1865	pmull2  $rk2q1, $res0.2d, $h3.2d                          @ GHASH final-2 block - high
1866	mov     $output_l0, $ctr0.d[0]                            @ AES final-1 block - mov low
1867
1868	mov     $output_h0, $ctr0.d[1]                            @ AES final-1 block - mov high
1869	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-2 block - mid
1870
1871	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
1872
1873	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                      @ GHASH final-2 block - mid
1874
1875	eor     $output_l0, $output_l0, $rk10_l                   @ AES final-1 block - round 10 low
1876#ifdef __AARCH64EB__
1877	rev     $output_l0, $output_l0
1878#endif
1879	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-2 block - low
1880
1881	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-2 block - high
1882
1883	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-2 block - mid
1884	eor     $output_h0, $output_h0, $rk10_h                   @ AES final-1 block - round 10 high
1885#ifdef __AARCH64EB__
1886	rev     $output_h0, $output_h0
1887#endif
1888	.L128_dec_blocks_more_than_1:                             @ blocks left >  1
1889
1890	rev64   $res0b, $res1b                                    @ GHASH final-1 block
1891
1892	ld1     { $res1b}, [$input_ptr], #16                      @ AES final block - load ciphertext
1893	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1894
1895	mov     $rk4d, $res0.d[1]                                 @ GHASH final-1 block - mid
1896
1897	eor     $ctr0b, $res1b, $ctr3b                            @ AES final block - result
1898
1899	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-1 block - mid
1900
1901	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-1 block  - store result
1902	mov     $output_l0, $ctr0.d[0]                            @ AES final block - mov low
1903
1904	mov     $output_h0, $ctr0.d[1]                            @ AES final block - mov high
1905	ins     $rk4v.d[1], $rk4v.d[0]                            @ GHASH final-1 block - mid
1906
1907	pmull   $rk3q1, $res0.1d, $h2.1d                          @ GHASH final-1 block - low
1908
1909	pmull2  $rk2q1, $res0.2d, $h2.2d                          @ GHASH final-1 block - high
1910
1911	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                      @ GHASH final-1 block - mid
1912	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
1913
1914	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-1 block - low
1915
1916	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-1 block - high
1917	eor     $output_h0, $output_h0, $rk10_h                   @ AES final block - round 10 high
1918#ifdef __AARCH64EB__
1919	rev     $output_h0, $output_h0
1920#endif
1921	eor     $output_l0, $output_l0, $rk10_l                   @ AES final block - round 10 low
1922#ifdef __AARCH64EB__
1923	rev     $output_l0, $output_l0
1924#endif
1925	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-1 block - mid
1926	.L128_dec_blocks_less_than_1:                                            @ blocks left <= 1
1927
1928	mvn     $rk10_h, xzr                                      @ rk10_h = 0xffffffffffffffff
1929	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
1930
1931	mvn     $rk10_l, xzr                                      @ rk10_l = 0xffffffffffffffff
1932	sub     $bit_length, $bit_length, #128                    @ bit_length -= 128
1933
1934	neg     $bit_length, $bit_length                          @ bit_length = 128 - #bits in input (in range [1,128])
1935
1936	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
1937
1938	lsr     $rk10_h, $rk10_h, $bit_length                     @ rk10_h is mask for top 64b of last block
1939	cmp     $bit_length, #64
1940
1941	csel    $ctr96_b64x, $rk10_h, xzr, lt
1942	csel    $ctr32x, $rk10_l, $rk10_h, lt
1943
1944	fmov    $ctr0d, $ctr32x                                   @ ctr0b is mask for last block
1945
1946	mov     $ctr0.d[1], $ctr96_b64x
1947
1948	and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
1949
1950	rev64   $res0b, $res1b                                    @ GHASH final block
1951
1952	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1953
1954	ldp     $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
1955
1956	and     $output_h0, $output_h0, $ctr96_b64x
1957
1958	pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
1959	mov     $t0d, $res0.d[1]                                  @ GHASH final block - mid
1960
1961	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
1962	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final block - high
1963
1964	pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
1965
1966	pmull   $rk3q1, $res0.1d, $h1.1d                          @ GHASH final block - low
1967	bic     $end_input_ptr, $end_input_ptr, $ctr32x           @ mask out low existing bytes
1968	and     $output_l0, $output_l0, $ctr32x
1969
1970#ifndef __AARCH64EB__
1971	rev     $ctr32w, $rctr32w
1972#else
1973	mov     $ctr32w, $rctr32w
1974#endif
1975
1976	eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
1977	movi    $mod_constant.8b, #0xc2
1978
1979	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final block - low
1980
1981	bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x   @ mask out high existing bytes
1982	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
1983
1984	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
1985
1986	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
1987
1988	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
1989
1990	orr     $output_l0, $output_l0, $end_input_ptr
1991	str     $ctr32w, [$counter, #12]                          @ store the updated counter
1992
1993	orr     $output_h0, $output_h0, $main_end_input_ptr
1994	stp     $output_l0, $output_h0, [$output_ptr]
1995	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
1996
1997	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
1998
1999	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
2000
2001	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
2002	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
2003
2004	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
2005
2006	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
2007	ext     $acc_lb, $acc_lb, $acc_lb, #8
2008	rev64   $acc_lb, $acc_lb
2009	mov     x0, $len
2010	st1     { $acc_l.16b }, [$current_tag]
2011
2012	ldp     x21, x22, [sp, #16]
2013	ldp     x23, x24, [sp, #32]
2014	ldp     d8, d9, [sp, #48]
2015	ldp     d10, d11, [sp, #64]
2016	ldp     d12, d13, [sp, #80]
2017	ldp     d14, d15, [sp, #96]
2018	ldp     x19, x20, [sp], #112
2019	ret
2020
2021	.L128_dec_ret:
2022	mov w0, #0x0
2023	ret
2024.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
2025___
2026}
2027
2028{
2029my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); # Register aliases interpolated into the assembly heredocs of this scope; these four are x4..x7 (x4/x5 are copied to x16/x8 at kernel entry before being reused)
2030my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); # x19..x24, which the kernel prologue spills to the stack
2031my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); # same x19..x24 as $input_l1..$input_h3: the output names alias the input names
2032my ($output_l0,$output_h0)=map("x$_",(6..7)); # same x6/x7 as $input_l0/$input_h0
2033# Scalar counter state, last round key halves and saved length.
2034my $ctr32w="w9"; # 32-bit view of $ctr32x (x9)
2035my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15)); # x9..x15 in the order listed
2036my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); # 32-bit views of $ctr96_t32x (x11) and $rctr32x (x12)
2037# Counter blocks live in v0..v3 and result blocks in v4..v7; each has three spellings.
2038my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); # full-width byte arrangement (vN.16b)
2039my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); # bare vN, the heredoc appends the arrangement (e.g. ".d[1]", ".2d")
2040my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); # low 64-bit scalar view (dN)
2041my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); # 128-bit scalar view of the result registers only
2042# GHASH accumulators (high / mid / low) in v9..v11.
2043my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
2044my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
2045my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
2046# Hash-key registers v12..v17: $h1..$h4 are loaded from the $current_tag table, $h12k/$h34k are derived from them with trn/eor.
2047my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
2048my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); # q views exist only for $h1..$h4 (used by ldr)
2049my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); # .16b views exist only for $h1..$h4 (used by ext)
2050# Temporaries. Several names deliberately share one physical register; check the aliases noted below before reordering code that uses them.
2051my $t0="v8"; # v8 is also $t5 and $mod_constant
2052my $t0d="d8";
2053my $t3="v4"; # v4 is also $res0 / $ctr_t0
2054my $t3d="d4";
2055
2056my ($t1,$t2)=map("v$_",(30..31)); # v30 is also $t4/$t9; v31 is also $t6/$mod_t
2057my ($t1d,$t2d)=map("d$_",(30..31));
2058
2059my $t4="v30"; # same register as $t1 and $t9
2060my $t4d="d30";
2061my $t5="v8"; # same register as $t0 and $mod_constant
2062my $t5d="d8";
2063my $t6="v31"; # same register as $t2 and $mod_t
2064my $t6d="d31";
2065
2066my $t7="v5"; # same register as $res1 / $ctr_t1
2067my $t7d="d5";
2068my $t8="v6"; # same register as $res2 / $ctr_t2
2069my $t8d="d6";
2070my $t9="v30"; # same register as $t1 and $t4
2071my $t9d="d30";
2072# $ctr_t0..$ctr_t3 are v4..v7, i.e. the same registers as $res0..$res3.
2073my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
2074my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
2075my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
2076# Reduction scratch: built in the heredoc with movi #0xc2 followed by shl #56.
2077my $mod_constantd="d8"; # d8/v8 is shared with $t0 and $t5
2078my $mod_constant="v8";
2079my $mod_t="v31"; # shared with $t2 and $t6
2080# Round keys rk0..rk11 occupy v18..v29; note that $rkN already carries the ".16b" suffix.
2081my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
2082my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
2083my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29)); # .4s view, used by the ld1 round-key loads
2084my $rk2q1="v20.1q"; # .1q view of v20, the register that holds rk2
2085my $rk3q1="v21.1q"; # .1q view of v21, the register that holds rk3
2086my $rk4v="v22"; # bare view of v22, the register that holds rk4
2087my $rk4d="d22"; # low 64-bit scalar view of v22
2088
2089#########################################################################################
2090# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
2091#                               size_t len,
2092#                               unsigned char *out,
2093#                               const void *key,
2094#                               unsigned char ivec[16],
2095#                               u64 *Xi);
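2096# NOTE(review): len looks like a bit count (byte_len is derived below with lsr #3); round keys are read via \$cc and the tag/H table via \$current_tag, so confirm the key/Xi argument order above against the C prototype.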
2097$code.=<<___;
2098.global aes_gcm_enc_192_kernel
2099.type   aes_gcm_enc_192_kernel,%function
2100.align  4
2101aes_gcm_enc_192_kernel:
2102	cbz     x1, .L192_enc_ret
2103	stp     x19, x20, [sp, #-112]!
2104	mov     x16, x4
2105	mov     x8, x5
2106	stp     x21, x22, [sp, #16]
2107	stp     x23, x24, [sp, #32]
2108	stp     d8, d9, [sp, #48]
2109	stp     d10, d11, [sp, #64]
2110	stp     d12, d13, [sp, #80]
2111	stp     d14, d15, [sp, #96]
2112
2113	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]             @ ctr96_b64, ctr96_t32
2114#ifdef __AARCH64EB__
2115	rev     $ctr96_b64x, $ctr96_b64x
2116	rev     $ctr96_t32x, $ctr96_t32x
2117#endif
2118	ldp     $rk12_l, $rk12_h, [$cc, #192]                     @ load rk12
2119#ifdef __AARCH64EB__
2120	ror     $rk12_l, $rk12_l, #32
2121	ror     $rk12_h, $rk12_h, #32
2122#endif
2123	ld1     {$rk0s}, [$cc], #16	                             @ load rk0
2124
2125	ld1     {$rk1s}, [$cc], #16	                             @ load rk1
2126
2127	ld1     {$rk2s}, [$cc], #16	                             @ load rk2
2128
2129	lsr     $rctr32x, $ctr96_t32x, #32
2130	ld1     {$rk3s}, [$cc], #16	                             @ load rk3
2131	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2132
2133	ld1     {$rk4s}, [$cc], #16	                             @ load rk4
2134	rev     $rctr32w, $rctr32w                               @ rev_ctr32
2135
2136	add     $rctr32w, $rctr32w, #1                           @ increment rev_ctr32
2137	fmov    $ctr3d, $ctr96_b64x                              @ CTR block 3
2138
2139	rev     $ctr32w, $rctr32w                                @ CTR block 1
2140	add     $rctr32w, $rctr32w, #1                           @ CTR block 1
2141	fmov    $ctr1d, $ctr96_b64x                              @ CTR block 1
2142
2143	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 1
2144	ld1     { $ctr0b}, [$counter]                            @ special case vector load initial counter so we can start first AES block as quickly as possible
2145
2146	fmov    $ctr1.d[1], $ctr32x                              @ CTR block 1
2147	rev     $ctr32w, $rctr32w                                @ CTR block 2
2148	add     $rctr32w, $rctr32w, #1                           @ CTR block 2
2149
2150	fmov    $ctr2d, $ctr96_b64x                              @ CTR block 2
2151	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 2
2152
2153	fmov    $ctr2.d[1], $ctr32x                              @ CTR block 2
2154	rev     $ctr32w, $rctr32w                                @ CTR block 3
2155
2156	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 3
2157	ld1     {$rk5s}, [$cc], #16	                             @ load rk5
2158
2159	fmov    $ctr3.d[1], $ctr32x                              @ CTR block 3
2160
2161	ld1     {$rk6s}, [$cc], #16	                             @ load rk6
2162
2163	ld1     {$rk7s}, [$cc], #16	                             @ load rk7
2164
2165	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 0
2166	ld1     { $acc_lb}, [$current_tag]
2167	ext     $acc_lb, $acc_lb, $acc_lb, #8
2168	rev64   $acc_lb, $acc_lb
2169
2170	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 0
2171	ld1     {$rk8s}, [$cc], #16	                             @ load rk8
2172
2173	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 0
2174	ldr     $h4q, [$current_tag, #112]                       @ load h4l | h4h
2175#ifndef __AARCH64EB__
2176	ext     $h4b, $h4b, $h4b, #8
2177#endif
2178	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 0
2179	ld1     {$rk9s}, [$cc], #16	                             @ load rk9
2180
2181	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 1
2182	ld1     {$rk10s}, [$cc], #16	                         @ load rk10
2183
2184	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 1
2185	ldr     $h1q, [$current_tag, #32]                        @ load h1l | h1h
2186#ifndef __AARCH64EB__
2187	ext     $h1b, $h1b, $h1b, #8
2188#endif
2189	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 1
2190	ld1     {$rk11s}, [$cc], #16	                         @ load rk11
2191
2192	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 1
2193	ldr     $h3q, [$current_tag, #80]                        @ load h3l | h3h
2194#ifndef __AARCH64EB__
2195	ext     $h3b, $h3b, $h3b, #8
2196#endif
2197	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 2
2198
2199	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 2
2200
2201	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 2
2202
2203	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 3
2204	trn1    $acc_h.2d, $h3.2d,    $h4.2d                     @ h4h | h3h
2205
2206	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 3
2207
2208	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 2
2209	trn2    $h34k.2d,  $h3.2d,    $h4.2d                     @ h4l | h3l
2210
2211	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 4
2212
2213	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 3
2214
2215	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 3
2216
2217	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 5
2218
2219	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 4
2220
2221	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 4
2222
2223	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 6
2224
2225	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 4
2226
2227	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 5
2228
2229	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 5
2230
2231	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 5
2232
2233	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 6
2234	ldr     $h2q, [$current_tag, #64]                        @ load h2l | h2h
2235#ifndef __AARCH64EB__
2236	ext     $h2b, $h2b, $h2b, #8
2237#endif
2238	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 6
2239
2240	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 6
2241
2242	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 7
2243
2244	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 7
2245	trn2    $h12k.2d,  $h1.2d,    $h2.2d                     @ h2l | h1l
2246
2247	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 7
2248
2249	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 8
2250
2251	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 7
2252	trn1    $t0.2d,    $h1.2d,    $h2.2d                     @ h2h | h1h
2253
2254	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 8
2255
2256	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 8
2257
2258	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 8
2259
2260	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 9
2261
2262	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 9
2263
2264	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 9
2265
2266	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 9
2267
2268	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 10
2269
2270	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 10
2271
2272	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 10
2273	lsr     $main_end_input_ptr, $bit_length, #3             @ byte_len
2274	mov     $len, $main_end_input_ptr
2275
2276	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 10
2277	sub     $main_end_input_ptr, $main_end_input_ptr, #1     @ byte_len - 1
2278
2279	eor     $h12k.16b, $h12k.16b, $t0.16b                    @ h2k | h1k
2280	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2281
2282	eor     $h34k.16b, $h34k.16b, $acc_h.16b                 @ h4k | h3k
2283
2284	aese    $ctr2b, $rk11                                    @ AES block 2 - round 11
2285	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3  @ end_input_ptr
2286	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2287
2288	aese    $ctr1b, $rk11                                    @ AES block 1 - round 11
2289	cmp     $input_ptr, $main_end_input_ptr                  @ check if we have <= 4 blocks
2290
2291	aese    $ctr0b, $rk11                                    @ AES block 0 - round 11
2292	add     $rctr32w, $rctr32w, #1                           @ CTR block 3
2293
2294	aese    $ctr3b, $rk11                                    @ AES block 3 - round 11
2295	b.ge    .L192_enc_tail                                   @ handle tail
2296
2297	rev     $ctr32w, $rctr32w                                @ CTR block 4
2298	ldp     $input_l0, $input_h0, [$input_ptr, #0]           @ AES block 0 - load plaintext
2299#ifdef __AARCH64EB__
2300	rev     $input_l0, $input_l0
2301	rev     $input_h0, $input_h0
2302#endif
2303	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4
2304	ldp     $input_l2, $input_h2, [$input_ptr, #32]          @ AES block 2 - load plaintext
2305#ifdef __AARCH64EB__
2306	rev     $input_l2, $input_l2
2307	rev     $input_h2, $input_h2
2308#endif
2309	ldp     $input_l3, $input_h3, [$input_ptr, #48]          @ AES block 3 - load plaintext
2310#ifdef __AARCH64EB__
2311	rev     $input_l3, $input_l3
2312	rev     $input_h3, $input_h3
2313#endif
2314	ldp     $input_l1, $input_h1, [$input_ptr, #16]          @ AES block 1 - load plaintext
2315#ifdef __AARCH64EB__
2316	rev     $input_l1, $input_l1
2317	rev     $input_h1, $input_h1
2318#endif
2319	add     $input_ptr, $input_ptr, #64                      @ AES input_ptr update
2320	cmp     $input_ptr, $main_end_input_ptr                  @ check if we have <= 8 blocks
2321
2322	eor     $input_l0, $input_l0, $rk12_l                    @ AES block 0 - round 12 low
2323
2324	eor     $input_h0, $input_h0, $rk12_h                    @ AES block 0 - round 12 high
2325	eor     $input_h2, $input_h2, $rk12_h                    @ AES block 2 - round 12 high
2326	fmov    $ctr_t0d, $input_l0                              @ AES block 0 - mov low
2327
2328	eor     $input_h3, $input_h3, $rk12_h                    @ AES block 3 - round 12 high
2329	fmov    $ctr_t0.d[1], $input_h0                          @ AES block 0 - mov high
2330
2331	eor     $input_l2, $input_l2, $rk12_l                    @ AES block 2 - round 12 low
2332	eor     $input_l1, $input_l1, $rk12_l                    @ AES block 1 - round 12 low
2333
2334	fmov    $ctr_t1d, $input_l1                              @ AES block 1 - mov low
2335	eor     $input_h1, $input_h1, $rk12_h                    @ AES block 1 - round 12 high
2336
2337	fmov    $ctr_t1.d[1], $input_h1                          @ AES block 1 - mov high
2338
2339	eor     $input_l3, $input_l3, $rk12_l                    @ AES block 3 - round 12 low
2340	fmov    $ctr_t2d, $input_l2                              @ AES block 2 - mov low
2341
2342	add     $rctr32w, $rctr32w, #1                           @ CTR block 4
2343	eor     $res0b, $ctr_t0b, $ctr0b                         @ AES block 0 - result
2344	fmov    $ctr0d, $ctr96_b64x                              @ CTR block 4
2345
2346	fmov    $ctr0.d[1], $ctr32x                              @ CTR block 4
2347	rev     $ctr32w, $rctr32w                                @ CTR block 5
2348
2349	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 5
2350	add     $rctr32w, $rctr32w, #1                           @ CTR block 5
2351
2352	fmov    $ctr_t3d, $input_l3                              @ AES block 3 - mov low
2353	st1     { $res0b}, [$output_ptr], #16                    @ AES block 0 - store result
2354
2355	fmov    $ctr_t2.d[1], $input_h2                          @ AES block 2 - mov high
2356
2357	eor     $res1b, $ctr_t1b, $ctr1b                         @ AES block 1 - result
2358	fmov    $ctr1d, $ctr96_b64x                              @ CTR block 5
2359	st1     { $res1b}, [$output_ptr], #16                    @ AES block 1 - store result
2360
2361	fmov    $ctr_t3.d[1], $input_h3                          @ AES block 3 - mov high
2362
2363	fmov    $ctr1.d[1], $ctr32x                              @ CTR block 5
2364	rev     $ctr32w, $rctr32w                                @ CTR block 6
2365
2366	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 6
2367
2368	add     $rctr32w, $rctr32w, #1                           @ CTR block 6
2369	eor     $res2b, $ctr_t2b, $ctr2b                         @ AES block 2 - result
2370	fmov    $ctr2d, $ctr96_b64x                              @ CTR block 6
2371
2372	fmov    $ctr2.d[1], $ctr32x                              @ CTR block 6
2373	rev     $ctr32w, $rctr32w                                @ CTR block 7
2374
2375	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 7
2376	st1     { $res2b}, [$output_ptr], #16                    @ AES block 2 - store result
2377
2378	eor     $res3b, $ctr_t3b, $ctr3b                         @ AES block 3 - result
2379	st1     { $res3b}, [$output_ptr], #16                    @ AES block 3 - store result
2380	b.ge    .L192_enc_prepretail                             @ do prepretail
2381
2382	.L192_enc_main_loop:                                     @ main loop start
2383	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 0
2384	rev64   $res1b, $res1b                                   @ GHASH block 4k+1 (t0 and t1 free)
2385
2386	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 0
2387	ldp     $input_l1, $input_h1, [$input_ptr, #16]          @ AES block 4k+5 - load plaintext
2388#ifdef __AARCH64EB__
2389	rev     $input_l1, $input_l1
2390	rev     $input_h1, $input_h1
2391#endif
2392	ext     $acc_lb, $acc_lb, $acc_lb, #8                    @ PRE 0
2393	fmov    $ctr3d, $ctr96_b64x                              @ CTR block 4k+3
2394	rev64   $res0b, $res0b                                   @ GHASH block 4k (only t0 is free)
2395
2396	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 1
2397	fmov    $ctr3.d[1], $ctr32x                              @ CTR block 4k+3
2398
2399	pmull2  $t1.1q, $res1.2d, $h3.2d                         @ GHASH block 4k+1 - high
2400	rev64   $res3b, $res3b                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2401	ldp     $input_l2, $input_h2, [$input_ptr, #32]          @ AES block 4k+6 - load plaintext
2402#ifdef __AARCH64EB__
2403	rev     $input_l2, $input_l2
2404	rev     $input_h2, $input_h2
2405#endif
2406	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 0
2407	ldp     $input_l3, $input_h3, [$input_ptr, #48]          @ AES block 4k+3 - load plaintext
2408#ifdef __AARCH64EB__
2409	rev     $input_l3, $input_l3
2410	rev     $input_h3, $input_h3
2411#endif
2412	pmull   $t2.1q, $res1.1d, $h3.1d                         @ GHASH block 4k+1 - low
2413	eor     $res0b, $res0b, $acc_lb                          @ PRE 1
2414
2415	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 1
2416
2417	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 1
2418	rev64   $res2b, $res2b                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
2419
2420	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 0
2421	eor     $input_h3, $input_h3, $rk12_h                    @ AES block 4k+3 - round 12 high
2422
2423	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH block 4k - low
2424	mov     $t0d, $res0.d[1]                                 @ GHASH block 4k - mid
2425
2426	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 2
2427
2428	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 1
2429	eor     $input_l2, $input_l2, $rk12_l                    @ AES block 4k+6 - round 12 low
2430
2431	eor     $t0.8b, $t0.8b, $res0.8b                         @ GHASH block 4k - mid
2432	eor     $acc_lb, $acc_lb, $t2.16b                        @ GHASH block 4k+1 - low
2433
2434	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 3
2435	eor     $input_l1, $input_l1, $rk12_l                    @ AES block 4k+5 - round 12 low
2436
2437	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 2
2438	mov     $t6d, $res2.d[1]                                 @ GHASH block 4k+2 - mid
2439
2440	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH block 4k - high
2441	mov     $t3d, $res1.d[1]                                 @ GHASH block 4k+1 - mid
2442
2443	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 2
2444
2445	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 3
2446
2447	mov     $acc_md, $h34k.d[1]                              @ GHASH block 4k - mid
2448	eor     $acc_hb, $acc_hb, $t1.16b                        @ GHASH block 4k+1 - high
2449
2450	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 2
2451	eor     $t6.8b, $t6.8b, $res2.8b                         @ GHASH block 4k+2 - mid
2452
2453	pmull2  $t4.1q, $res2.2d, $h2.2d                         @ GHASH block 4k+2 - high
2454
2455	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 4
2456	eor     $t3.8b, $t3.8b, $res1.8b                         @ GHASH block 4k+1 - mid
2457
2458	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 3
2459
2460	pmull2  $t7.1q, $res3.2d, $h1.2d                         @ GHASH block 4k+3 - high
2461	eor     $input_h1, $input_h1, $rk12_h                    @ AES block 4k+5 - round 12 high
2462	ins     $t6.d[1], $t6.d[0]                               @ GHASH block 4k+2 - mid
2463
2464	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 5
2465	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+3
2466
2467	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 4
2468	eor     $acc_hb, $acc_hb, $t4.16b                        @ GHASH block 4k+2 - high
2469
2470	pmull   $t3.1q, $t3.1d, $h34k.1d                         @ GHASH block 4k+1 - mid
2471	eor     $input_h2, $input_h2, $rk12_h                    @ AES block 4k+6 - round 12 high
2472
2473	pmull2  $t6.1q, $t6.2d, $h12k.2d                         @ GHASH block 4k+2 - mid
2474	eor     $input_l3, $input_l3, $rk12_l                    @ AES block 4k+3 - round 12 low
2475	mov     $t9d, $res3.d[1]                                 @ GHASH block 4k+3 - mid
2476
2477	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                     @ GHASH block 4k - mid
2478	rev     $ctr32w, $rctr32w                                @ CTR block 4k+8
2479
2480	pmull   $t5.1q, $res2.1d, $h2.1d                         @ GHASH block 4k+2 - low
2481	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4k+8
2482
2483	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 3
2484	eor     $t9.8b, $t9.8b, $res3.8b                         @ GHASH block 4k+3 - mid
2485
2486	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 4
2487	ldp     $input_l0, $input_h0, [$input_ptr, #0]           @ AES block 4k+4 - load plaintext
2488#ifdef __AARCH64EB__
2489	rev     $input_l0, $input_l0
2490	rev     $input_h0, $input_h0
2491#endif
2492	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 6
2493	eor     $acc_lb, $acc_lb, $t5.16b                        @ GHASH block 4k+2 - low
2494
2495	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 4
2496	add     $input_ptr, $input_ptr, #64                      @ AES input_ptr update
2497
2498	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 5
2499	movi    $mod_constant.8b, #0xc2
2500
2501	pmull   $t8.1q, $res3.1d, $h1.1d                         @ GHASH block 4k+3 - low
2502	eor     $input_h0, $input_h0, $rk12_h                    @ AES block 4k+4 - round 12 high
2503	eor     $acc_mb, $acc_mb, $t3.16b                        @ GHASH block 4k+1 - mid
2504
2505	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 5
2506	eor     $input_l0, $input_l0, $rk12_l                    @ AES block 4k+4 - round 12 low
2507
2508	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 6
2509	shl     $mod_constantd, $mod_constantd, #56              @ mod_constant
2510
2511	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 5
2512	eor     $acc_hb, $acc_hb, $t7.16b                        @ GHASH block 4k+3 - high
2513
2514	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 7
2515	fmov    $ctr_t1d, $input_l1                              @ AES block 4k+5 - mov low
2516
2517	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 7
2518	eor     $acc_mb, $acc_mb, $t6.16b                        @ GHASH block 4k+2 - mid
2519
2520	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 6
2521	fmov    $ctr_t1.d[1], $input_h1                          @ AES block 4k+5 - mov high
2522
2523	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 8
2524	eor     $acc_lb, $acc_lb, $t8.16b                        @ GHASH block 4k+3 - low
2525
2526	pmull   $t9.1q, $t9.1d, $h12k.1d                         @ GHASH block 4k+3 - mid
2527	cmp     $input_ptr, $main_end_input_ptr                  @ LOOP CONTROL
2528	fmov    $ctr_t0d, $input_l0                              @ AES block 4k+4 - mov low
2529
2530	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 6
2531	fmov    $ctr_t0.d[1], $input_h0                          @ AES block 4k+4 - mov high
2532
2533	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 8
2534	fmov    $ctr_t3d, $input_l3                              @ AES block 4k+3 - mov low
2535
2536	eor     $acc_mb, $acc_mb, $t9.16b                        @ GHASH block 4k+3 - mid
2537	eor     $t9.16b, $acc_lb, $acc_hb                        @ MODULO - karatsuba tidy up
2538	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+8
2539
2540	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 7
2541	fmov    $ctr_t3.d[1], $input_h3                          @ AES block 4k+3 - mov high
2542
2543	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d           @ MODULO - top 64b align with mid
2544	ext     $acc_hb, $acc_hb, $acc_hb, #8                    @ MODULO - other top alignment
2545	fmov    $ctr_t2d, $input_l2                              @ AES block 4k+6 - mov low
2546
2547	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 7
2548
2549	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 9
2550	eor     $acc_mb, $acc_mb, $t9.16b                        @ MODULO - karatsuba tidy up
2551
2552	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 8
2553
2554	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 8
2555
2556	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 9
2557
2558	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 10
2559	eor     $acc_mb, $acc_mb, $mod_t.16b                     @ MODULO - fold into mid
2560
2561	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 9
2562
2563	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 9
2564
2565	aese    $ctr0b, $rk11                                    @ AES block 4k+4 - round 11
2566
2567	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 10
2568	eor     $acc_mb, $acc_mb, $acc_hb                        @ MODULO - fold into mid
2569
2570	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 10
2571
2572	eor     $res0b, $ctr_t0b, $ctr0b                         @ AES block 4k+4 - result
2573	fmov    $ctr0d, $ctr96_b64x                              @ CTR block 4k+8
2574
2575	aese    $ctr1b, $rk11                                    @ AES block 4k+5 - round 11
2576	fmov    $ctr0.d[1], $ctr32x                              @ CTR block 4k+8
2577	rev     $ctr32w, $rctr32w                                @ CTR block 4k+9
2578
2579	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d           @ MODULO - mid 64b align with low
2580	fmov    $ctr_t2.d[1], $input_h2                          @ AES block 4k+6 - mov high
2581	st1     { $res0b}, [$output_ptr], #16                    @ AES block 4k+4 - store result
2582
2583	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 10
2584	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4k+9
2585
2586	eor     $res1b, $ctr_t1b, $ctr1b                         @ AES block 4k+5 - result
2587	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+9
2588	fmov    $ctr1d, $ctr96_b64x                              @ CTR block 4k+9
2589
2590	aese    $ctr2b, $rk11                                    @ AES block 4k+6 - round 11
2591	fmov    $ctr1.d[1], $ctr32x                              @ CTR block 4k+9
2592	rev     $ctr32w, $rctr32w                                @ CTR block 4k+10
2593
2594	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+10
2595	ext     $acc_mb, $acc_mb, $acc_mb, #8                    @ MODULO - other mid alignment
2596	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4k+10
2597
2598	st1     { $res1b}, [$output_ptr], #16                    @ AES block 4k+5 - store result
2599	eor     $acc_lb, $acc_lb, $acc_hb                        @ MODULO - fold into low
2600
2601	aese    $ctr3b, $rk11                                    @ AES block 4k+7 - round 11
2602	eor     $res2b, $ctr_t2b, $ctr2b                         @ AES block 4k+6 - result
2603	fmov    $ctr2d, $ctr96_b64x                              @ CTR block 4k+10
2604
2605	st1     { $res2b}, [$output_ptr], #16                    @ AES block 4k+6 - store result
2606	fmov    $ctr2.d[1], $ctr32x                              @ CTR block 4k+10
2607	rev     $ctr32w, $rctr32w                                @ CTR block 4k+11
2608
2609	eor     $acc_lb, $acc_lb, $acc_mb                        @ MODULO - fold into low
2610	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4k+11
2611
2612	eor     $res3b, $ctr_t3b, $ctr3b                         @ AES block 4k+3 - result
2613	st1     { $res3b}, [$output_ptr], #16                    @ AES block 4k+3 - store result
2614	b.lt    .L192_enc_main_loop
2615
2616	.L192_enc_prepretail:                                    @ PREPRETAIL
2617	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 0
2618	rev64   $res0b, $res0b                                   @ GHASH block 4k (only t0 is free)
2619
2620	fmov    $ctr3d, $ctr96_b64x                              @ CTR block 4k+3
2621	ext     $acc_lb, $acc_lb, $acc_lb, #8                    @ PRE 0
2622	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+3
2623
2624	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 0
2625	rev64   $res1b, $res1b                                   @ GHASH block 4k+1 (t0 and t1 free)
2626
2627	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 0
2628
2629	fmov    $ctr3.d[1], $ctr32x                              @ CTR block 4k+3
2630	eor     $res0b, $res0b, $acc_lb                          @ PRE 1
2631	mov     $acc_md, $h34k.d[1]                              @ GHASH block 4k - mid
2632
2633	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 1
2634	rev64   $res2b, $res2b                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
2635
2636	pmull2  $t1.1q, $res1.2d, $h3.2d                         @ GHASH block 4k+1 - high
2637
2638	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH block 4k - low
2639	mov     $t0d, $res0.d[1]                                 @ GHASH block 4k - mid
2640
2641	pmull   $t2.1q, $res1.1d, $h3.1d                         @ GHASH block 4k+1 - low
2642	rev64   $res3b, $res3b                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2643
2644	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH block 4k - high
2645
2646	eor     $t0.8b, $t0.8b, $res0.8b                         @ GHASH block 4k - mid
2647	mov     $t3d, $res1.d[1]                                 @ GHASH block 4k+1 - mid
2648
2649	eor     $acc_lb, $acc_lb, $t2.16b                        @ GHASH block 4k+1 - low
2650	mov     $t6d, $res2.d[1]                                 @ GHASH block 4k+2 - mid
2651
2652	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 0
2653	eor     $acc_hb, $acc_hb, $t1.16b                        @ GHASH block 4k+1 - high
2654
2655	pmull2  $t4.1q, $res2.2d, $h2.2d                         @ GHASH block 4k+2 - high
2656
2657	eor     $t3.8b, $t3.8b, $res1.8b                         @ GHASH block 4k+1 - mid
2658	eor     $t6.8b, $t6.8b, $res2.8b                         @ GHASH block 4k+2 - mid
2659
2660	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 1
2661
2662	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 1
2663	eor     $acc_hb, $acc_hb, $t4.16b                        @ GHASH block 4k+2 - high
2664
2665	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 1
2666
2667	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 2
2668	mov     $t9d, $res3.d[1]                                 @ GHASH block 4k+3 - mid
2669
2670	pmull2  $t7.1q, $res3.2d, $h1.2d                         @ GHASH block 4k+3 - high
2671	ins     $t6.d[1], $t6.d[0]                               @ GHASH block 4k+2 - mid
2672
2673	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 2
2674
2675	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                     @ GHASH block 4k - mid
2676	eor     $t9.8b, $t9.8b, $res3.8b                         @ GHASH block 4k+3 - mid
2677
2678	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 3
2679
2680	pmull2  $t6.1q, $t6.2d, $h12k.2d                         @ GHASH block 4k+2 - mid
2681
2682	pmull   $t3.1q, $t3.1d, $h34k.1d                         @ GHASH block 4k+1 - mid
2683
2684	pmull   $t9.1q, $t9.1d, $h12k.1d                         @ GHASH block 4k+3 - mid
2685	eor     $acc_hb, $acc_hb, $t7.16b                        @ GHASH block 4k+3 - high
2686
2687	pmull   $t5.1q, $res2.1d, $h2.1d                         @ GHASH block 4k+2 - low
2688
2689	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 3
2690	eor     $acc_mb, $acc_mb, $t3.16b                        @ GHASH block 4k+1 - mid
2691
2692	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 2
2693
2694	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 2
2695	eor     $acc_lb, $acc_lb, $t5.16b                        @ GHASH block 4k+2 - low
2696
2697	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 4
2698
2699	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 3
2700	eor     $acc_mb, $acc_mb, $t6.16b                        @ GHASH block 4k+2 - mid
2701
2702	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 3
2703
2704	pmull   $t8.1q, $res3.1d, $h1.1d                         @ GHASH block 4k+3 - low
2705	movi    $mod_constant.8b, #0xc2
2706
2707	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 4
2708
2709	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 4
2710
2711	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 4
2712	eor     $acc_mb, $acc_mb, $t9.16b                        @ GHASH block 4k+3 - mid
2713
2714	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 5
2715
2716	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 5
2717
2718	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 5
2719	eor     $acc_lb, $acc_lb, $t8.16b                        @ GHASH block 4k+3 - low
2720
2721	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 5
2722
2723	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 6
2724	eor     $acc_mb, $acc_mb, $acc_hb                        @ karatsuba tidy up
2725
2726	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 6
2727
2728	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 6
2729	shl     $mod_constantd, $mod_constantd, #56              @ mod_constant
2730
2731	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 7
2732
2733	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 7
2734	eor     $acc_mb, $acc_mb, $acc_lb
2735
2736	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 7
2737
2738	pmull   $t1.1q, $acc_h.1d, $mod_constant.1d
2739
2740	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 6
2741	ext     $acc_hb, $acc_hb, $acc_hb, #8
2742
2743	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 8
2744
2745	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 8
2746	eor     $acc_mb, $acc_mb, $t1.16b
2747
2748	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 7
2749
2750	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 8
2751
2752	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 9
2753
2754	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 8
2755	eor     $acc_mb, $acc_mb, $acc_hb
2756
2757	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 9
2758
2759	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 9
2760
2761	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 9
2762
2763	pmull   $t1.1q, $acc_m.1d, $mod_constant.1d
2764
2765	ext     $acc_mb, $acc_mb, $acc_mb, #8
2766
2767	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 10
2768
2769	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 10
2770
2771	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 10
2772
2773	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 10
2774	eor     $acc_lb, $acc_lb, $t1.16b
2775
2776	aese    $ctr0b, $rk11                                    @ AES block 4k+4 - round 11
2777
2778	aese    $ctr3b, $rk11                                    @ AES block 4k+7 - round 11
2779
2780	aese    $ctr2b, $rk11                                    @ AES block 4k+6 - round 11
2781
2782	aese    $ctr1b, $rk11                                    @ AES block 4k+5 - round 11
2783	eor     $acc_lb, $acc_lb, $acc_mb
2784	.L192_enc_tail:                                          @ TAIL
2785
2786	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr  @ main_end_input_ptr is number of bytes left to process
2787	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES block 4k+4 - load plaintext
2788#ifdef __AARCH64EB__
2789	rev     $input_l0, $input_l0
2790	rev     $input_h0, $input_h0
2791#endif
2792	eor     $input_l0, $input_l0, $rk12_l                    @ AES block 4k+4 - round 12 low
2793	eor     $input_h0, $input_h0, $rk12_h                    @ AES block 4k+4 - round 12 high
2794
2795	fmov    $ctr_t0d, $input_l0                              @ AES block 4k+4 - mov low
2796
2797	fmov    $ctr_t0.d[1], $input_h0                          @ AES block 4k+4 - mov high
2798	cmp     $main_end_input_ptr, #48
2799
2800	eor     $res1b, $ctr_t0b, $ctr0b                         @ AES block 4k+4 - result
2801
2802	ext     $t0.16b, $acc_lb, $acc_lb, #8                    @ prepare final partial tag
2803	b.gt    .L192_enc_blocks_more_than_3
2804
2805	sub     $rctr32w, $rctr32w, #1
2806	movi    $acc_m.8b, #0
2807
2808	mov     $ctr3b, $ctr2b
2809	movi    $acc_h.8b, #0
2810	cmp     $main_end_input_ptr, #32
2811
2812	mov     $ctr2b, $ctr1b
2813	movi    $acc_l.8b, #0
2814	b.gt    .L192_enc_blocks_more_than_2
2815
2816	sub     $rctr32w, $rctr32w, #1
2817
2818	mov     $ctr3b, $ctr1b
2819	cmp     $main_end_input_ptr, #16
2820	b.gt    .L192_enc_blocks_more_than_1
2821
2822	sub     $rctr32w, $rctr32w, #1
2823	b       .L192_enc_blocks_less_than_1
2824	.L192_enc_blocks_more_than_3:                            @ blocks left >  3
2825	st1     { $res1b}, [$output_ptr], #16                    @ AES final-3 block  - store result
2826
2827	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-2 block - load input low & high
2828#ifdef __AARCH64EB__
2829	rev     $input_l0, $input_l0
2830	rev     $input_h0, $input_h0
2831#endif
2832	rev64   $res0b, $res1b                                   @ GHASH final-3 block
2833
2834	eor     $input_l0, $input_l0, $rk12_l                    @ AES final-2 block - round 12 low
2835	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
2836
2837	eor     $input_h0, $input_h0, $rk12_h                    @ AES final-2 block - round 12 high
2838	fmov    $res1d, $input_l0                                @ AES final-2 block - mov low
2839
2840	fmov    $res1.d[1], $input_h0                            @ AES final-2 block - mov high
2841
2842	mov     $rk4d, $res0.d[1]                                @ GHASH final-3 block - mid
2843
2844	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH final-3 block - low
2845
2846	mov     $acc_md, $h34k.d[1]                              @ GHASH final-3 block - mid
2847
2848	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-3 block - mid
2849
2850	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
2851
2852	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH final-3 block - high
2853
2854	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   @ GHASH final-3 block - mid
2855	eor     $res1b, $res1b, $ctr1b                           @ AES final-2 block - result
2856	.L192_enc_blocks_more_than_2:                            @ blocks left >  2
2857
2858	st1     { $res1b}, [$output_ptr], #16                    @ AES final-2 block - store result
2859
2860	rev64   $res0b, $res1b                                   @ GHASH final-2 block
2861	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-1 block - load input low & high
2862#ifdef __AARCH64EB__
2863	rev     $input_l0, $input_l0
2864	rev     $input_h0, $input_h0
2865#endif
2866	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
2867
2868	eor     $input_h0, $input_h0, $rk12_h                    @ AES final-1 block - round 12 high
2869
2870	pmull2  $rk2q1, $res0.2d, $h3.2d                         @ GHASH final-2 block - high
2871	mov     $rk4d, $res0.d[1]                                @ GHASH final-2 block - mid
2872
2873	pmull   $rk3q1, $res0.1d, $h3.1d                         @ GHASH final-2 block - low
2874	eor     $input_l0, $input_l0, $rk12_l                    @ AES final-1 block - round 12 low
2875
2876	fmov    $res1d, $input_l0                                @ AES final-1 block - mov low
2877
2878	fmov    $res1.d[1], $input_h0                            @ AES final-1 block - mov high
2879	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-2 block - high
2880	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-2 block - mid
2881
2882	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-2 block - low
2883
2884	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     @ GHASH final-2 block - mid
2885
2886	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
2887
2888	eor     $res1b, $res1b, $ctr2b                           @ AES final-1 block - result
2889
2890	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-2 block - mid
2891	.L192_enc_blocks_more_than_1:                            @ blocks left >  1
2892
2893	st1     { $res1b}, [$output_ptr], #16                    @ AES final-1 block - store result
2894
2895	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final block - load input low & high
2896#ifdef __AARCH64EB__
2897	rev     $input_l0, $input_l0
2898	rev     $input_h0, $input_h0
2899#endif
2900	rev64   $res0b, $res1b                                   @ GHASH final-1 block
2901
2902	eor     $input_l0, $input_l0, $rk12_l                    @ AES final block - round 12 low
2903	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
2904	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
2905
2906	mov     $rk4d, $res0.d[1]                                @ GHASH final-1 block - mid
2907
2908	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-1 block - mid
2909	eor     $input_h0, $input_h0, $rk12_h                    @ AES final block - round 12 high
2910	fmov    $res1d, $input_l0                                @ AES final block - mov low
2911
2912	pmull2  $rk2q1, $res0.2d, $h2.2d                         @ GHASH final-1 block - high
2913	fmov    $res1.d[1], $input_h0                            @ AES final block - mov high
2914
2915	ins     $rk4v.d[1], $rk4v.d[0]                           @ GHASH final-1 block - mid
2916
2917	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-1 block - high
2918
2919	pmull   $rk3q1, $res0.1d, $h2.1d                         @ GHASH final-1 block - low
2920
2921	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     @ GHASH final-1 block - mid
2922
2923	eor     $res1b, $res1b, $ctr3b                           @ AES final block - result
2924
2925	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-1 block - low
2926
2927	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-1 block - mid
2928	.L192_enc_blocks_less_than_1:                            @ blocks left <= 1
2929
2930	ld1     { $rk0}, [$output_ptr]                           @ load existing bytes where the possibly partial last block is to be stored
2931#ifndef __AARCH64EB__
2932	rev     $ctr32w, $rctr32w
2933#else
2934	mov     $ctr32w, $rctr32w
2935#endif
2936	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
2937
2938	sub     $bit_length, $bit_length, #128                   @ bit_length -= 128
2939	mvn     $rk12_h, xzr                                     @ rk12_h = 0xffffffffffffffff
2940
2941	neg     $bit_length, $bit_length                         @ bit_length = 128 - #bits in input (in range [1,128])
2942	mvn     $rk12_l, xzr                                     @ rk12_l = 0xffffffffffffffff
2943
2944	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
2945
2946	lsr     $rk12_h, $rk12_h, $bit_length                    @ rk12_h is mask for top 64b of last block
2947	cmp     $bit_length, #64
2948
2949	csel    $input_l0, $rk12_l, $rk12_h, lt
2950	csel    $input_h0, $rk12_h, xzr, lt
2951
2952	fmov    $ctr0d, $input_l0                                @ ctr0b is mask for last block
2953
2954	fmov    $ctr0.d[1], $input_h0
2955
2956	and     $res1b, $res1b, $ctr0b                           @ possibly partial last block has zeroes in highest bits
2957
2958	rev64   $res0b, $res1b                                   @ GHASH final block
2959
2960	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
2961
2962	mov     $t0d, $res0.d[1]                                 @ GHASH final block - mid
2963
2964	pmull   $rk3q1, $res0.1d, $h1.1d                         @ GHASH final block - low
2965
2966	pmull2  $rk2q1, $res0.2d, $h1.2d                         @ GHASH final block - high
2967
2968	eor     $t0.8b, $t0.8b, $res0.8b                         @ GHASH final block - mid
2969
2970	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final block - low
2971
2972	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final block - high
2973
2974	pmull   $t0.1q, $t0.1d, $h12k.1d                         @ GHASH final block - mid
2975
2976	eor     $acc_mb, $acc_mb, $t0.16b                        @ GHASH final block - mid
2977	movi    $mod_constant.8b, #0xc2
2978
2979	eor     $t9.16b, $acc_lb, $acc_hb                        @ MODULO - karatsuba tidy up
2980
2981	shl     $mod_constantd, $mod_constantd, #56              @ mod_constant
2982
2983	bif     $res1b, $rk0, $ctr0b                             @ insert existing bytes in top end of result before storing
2984
2985	eor     $acc_mb, $acc_mb, $t9.16b                        @ MODULO - karatsuba tidy up
2986
2987	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d           @ MODULO - top 64b align with mid
2988
2989	ext     $acc_hb, $acc_hb, $acc_hb, #8                    @ MODULO - other top alignment
2990
2991	eor     $acc_mb, $acc_mb, $mod_t.16b                     @ MODULO - fold into mid
2992
2993	eor     $acc_mb, $acc_mb, $acc_hb                        @ MODULO - fold into mid
2994
2995	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d           @ MODULO - mid 64b align with low
2996
2997	ext     $acc_mb, $acc_mb, $acc_mb, #8                    @ MODULO - other mid alignment
2998
2999	eor     $acc_lb, $acc_lb, $acc_hb                        @ MODULO - fold into low
3000	str     $ctr32w, [$counter, #12]                         @ store the updated counter
3001
3002	st1     { $res1b}, [$output_ptr]                         @ store all 16B
3003
3004	eor     $acc_lb, $acc_lb, $acc_mb                        @ MODULO - fold into low
3005	ext     $acc_lb, $acc_lb, $acc_lb, #8
3006	rev64   $acc_lb, $acc_lb
3007	mov     x0, $len
3008	st1     { $acc_l.16b }, [$current_tag]
3009
3010	ldp     x21, x22, [sp, #16]
3011	ldp     x23, x24, [sp, #32]
3012	ldp     d8, d9, [sp, #48]
3013	ldp     d10, d11, [sp, #64]
3014	ldp     d12, d13, [sp, #80]
3015	ldp     d14, d15, [sp, #96]
3016	ldp     x19, x20, [sp], #112
3017	ret
3018
3019.L192_enc_ret:
3020	mov w0, #0x0
3021	ret
3022.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
3023___
3024
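#########################################################################################
# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
#                               size_t len,
#                               unsigned char *out,
#                               const void *key,
#                               unsigned char ivec[16],
#                               u64 *Xi);
#
# NOTE(review): the body below works on a bit count - $bit_length is shifted
# right by 3 to obtain byte_len and end_input_ptr - so `len` is presumably
# expressed in bits rather than bytes; confirm against the C caller.
# The routine branches straight to .L192_dec_ret when x1 is zero, before any
# registers are saved.
#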
3033$code.=<<___;
3034.global aes_gcm_dec_192_kernel
3035.type   aes_gcm_dec_192_kernel,%function
3036.align  4
3037aes_gcm_dec_192_kernel:
3038	cbz     x1, .L192_dec_ret
3039	stp     x19, x20, [sp, #-112]!
3040	mov     x16, x4
3041	mov     x8, x5
3042	stp     x21, x22, [sp, #16]
3043	stp     x23, x24, [sp, #32]
3044	stp     d8, d9, [sp, #48]
3045	stp     d10, d11, [sp, #64]
3046	stp     d12, d13, [sp, #80]
3047	stp     d14, d15, [sp, #96]
3048
3049	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
3050	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
3051#ifdef __AARCH64EB__
3052	rev     $ctr96_b64x, $ctr96_b64x
3053	rev     $ctr96_t32x, $ctr96_t32x
3054#endif
3055	ldp     $rk12_l, $rk12_h, [$cc, #192]                     @ load rk12
3056#ifdef __AARCH64EB__
3057	ror     $rk12_l, $rk12_l, #32
3058	ror     $rk12_h, $rk12_h, #32
3059#endif
3060	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
3061
3062	ld1     {$rk0s}, [$cc], #16                                  @ load rk0
3063
3064	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
3065	mov     $len, $main_end_input_ptr
3066	ld1     {$rk1s}, [$cc], #16                               @ load rk1
3067
3068	lsr     $rctr32x, $ctr96_t32x, #32
3069	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
3070	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
3071
3072	rev     $rctr32w, $rctr32w                                @ rev_ctr32
3073	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
3074
3075	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
3076	ld1     {$rk2s}, [$cc], #16                               @ load rk2
3077
3078	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
3079	rev     $ctr32w, $rctr32w                                 @ CTR block 1
3080
3081	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
3082	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
3083	ld1     {$rk3s}, [$cc], #16                               @ load rk3
3084
3085	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
3086	rev     $ctr32w, $rctr32w                                 @ CTR block 2
3087	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
3088
3089	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
3090	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
3091
3092	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
3093	rev     $ctr32w, $rctr32w                                 @ CTR block 3
3094
3095	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
3096	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
3097
3098	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
3099
3100	ld1     {$rk4s}, [$cc], #16                               @ load rk4
3101
3102	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
3103
3104	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
3105	ld1     {$rk5s}, [$cc], #16                               @ load rk5
3106
3107	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
3108	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
3109#ifndef __AARCH64EB__
3110	ext     $h4b, $h4b, $h4b, #8
3111#endif
3112	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
3113	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
3114#ifndef __AARCH64EB__
3115	ext     $h2b, $h2b, $h2b, #8
3116#endif
3117	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
3118	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
3119#ifndef __AARCH64EB__
3120	ext     $h3b, $h3b, $h3b, #8
3121#endif
3122	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
3123
3124	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
3125	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
3126#ifndef __AARCH64EB__
3127	ext     $h1b, $h1b, $h1b, #8
3128#endif
3129	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
3130	ld1     {$rk6s}, [$cc], #16                               @ load rk6
3131
3132	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
3133	ld1     {$rk7s}, [$cc], #16                               @ load rk7
3134
3135	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
3136	ld1     {$rk8s}, [$cc], #16                               @ load rk8
3137
3138	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
3139	ld1     {$rk9s}, [$cc], #16                               @ load rk9
3140
3141	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
3142	ld1     { $acc_lb}, [$current_tag]
3143	ext     $acc_lb, $acc_lb, $acc_lb, #8
3144	rev64   $acc_lb, $acc_lb
3145
3146	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
3147	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
3148
3149	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
3150	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
3151
3152	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
3153	ld1     {$rk10s}, [$cc], #16                              @ load rk10
3154
3155	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
3156	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
3157
3158	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
3159
3160	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
3161	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
3162
3163	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
3164	ld1     {$rk11s}, [$cc], #16                              @ load rk11
3165
3166	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
3167
3168	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
3169
3170	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
3171
3172	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
3173
3174	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
3175
3176	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
3177
3178	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
3179
3180	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
3181
3182	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
3183
3184	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
3185
3186	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
3187
3188	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
3189
3190	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
3191
3192	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 9
3193
3194	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 9
3195
3196	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
3197	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
3198
3199	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
3200	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3201
3202	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 10
3203	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3204
3205	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 9
3206	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
3207
3208	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 9
3209	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
3210
3211	aese    $ctr3b, $rk11                                     @ AES block 3 - round 11
3212
3213	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 10
3214
3215	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 10
3216
3217	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 10
3218	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
3219
3220	aese    $ctr2b, $rk11                                     @ AES block 2 - round 11
3221
3222	aese    $ctr1b, $rk11                                     @ AES block 1 - round 11
3223	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
3224
3225	aese    $ctr0b, $rk11                                     @ AES block 0 - round 11
3226	b.ge    .L192_dec_tail                                    @ handle tail
3227
3228	ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0,1 - load ciphertext
3229
3230	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
3231
3232	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 0 - result
3233	rev     $ctr32w, $rctr32w                                 @ CTR block 4
3234	ld1     {$res2b, $res3b}, [$input_ptr], #32               @ AES block 2,3 - load ciphertext
3235
3236	mov     $output_l1, $ctr1.d[0]                            @ AES block 1 - mov low
3237
3238	mov     $output_h1, $ctr1.d[1]                            @ AES block 1 - mov high
3239
3240	mov     $output_l0, $ctr0.d[0]                            @ AES block 0 - mov low
3241	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
3242	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
3243
3244	mov     $output_h0, $ctr0.d[1]                            @ AES block 0 - mov high
3245	rev64   $res0b, $res0b                                    @ GHASH block 0
3246
3247	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
3248	rev64   $res1b, $res1b                                    @ GHASH block 1
3249	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
3250
3251	eor     $output_l1, $output_l1, $rk12_l                   @ AES block 1 - round 12 low
3252#ifdef __AARCH64EB__
3253	rev     $output_l1, $output_l1
3254#endif
3255	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
3256	rev     $ctr32w, $rctr32w                                 @ CTR block 5
3257
3258	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
3259	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
3260	eor     $output_h1, $output_h1, $rk12_h                   @ AES block 1 - round 12 high
3261#ifdef __AARCH64EB__
3262	rev     $output_h1, $output_h1
3263#endif
3264	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
3265	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
3266	eor     $output_l0, $output_l0, $rk12_l                   @ AES block 0 - round 12 low
3267#ifdef __AARCH64EB__
3268	rev     $output_l0, $output_l0
3269#endif
3270	rev     $ctr32w, $rctr32w                                 @ CTR block 6
3271	eor     $output_h0, $output_h0, $rk12_h                   @ AES block 0 - round 12 high
3272#ifdef __AARCH64EB__
3273	rev     $output_h0, $output_h0
3274#endif
3275	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
3276	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
3277
3278	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 1 - store result
3279
3280	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
3281	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 2 - result
3282	b.ge    .L192_dec_prepretail                              @ do prepretail
3283
3284	.L192_dec_main_loop:                                      @ main loop start - 4 blocks (64 bytes) loaded and stored per iteration
3285	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
3286	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
3287
3288	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
3289	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
3290
3291	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
3292	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
3293	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
3294
3295	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
3296	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
3297
3298	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
3299	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
3300
3301	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
3302	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
3303
3304	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
3305	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
3306
3307	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
3308	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
3309
3310	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
3311	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
3312	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
3313
3314	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
3315	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
3316	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
3317
3318	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
3319	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
3320
3321	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
3322	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
3323	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
3324
3325	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
3326
3327	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
3328	eor     $output_h2, $output_h2, $rk12_h                   @ AES block 4k+2 - round 12 high
3329#ifdef __AARCH64EB__
3330	rev     $output_h2, $output_h2
3331#endif
3332	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
3333	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
3334
3335	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
3336
3337	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
3338	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
3339
3340	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
3341
3342	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
3343	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
3344	eor     $output_l2, $output_l2, $rk12_l                   @ AES block 4k+2 - round 12 low
3345#ifdef __AARCH64EB__
3346	rev     $output_l2, $output_l2
3347#endif
3348	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
3349
3350	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
3351
3352	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
3353	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
3354
3355	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
3356	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
3357
3358	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
3359
3360	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
3361	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
3362
3363	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
3364
3365	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
3366
3367	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
3368	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
3369
3370	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
3371
3372	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
3373
3374	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
3375	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
3376
3377	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
3378
3379	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
3380	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
3381
3382	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
3383
3384	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
3385	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
3386
3387	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
3388
3389	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
3390	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
3391
3392	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
3393
3394	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
3395	movi    $mod_constant.8b, #0xc2                           @ MODULO - set each of the 8 low bytes of mod_constant to 0xc2
3396
3397	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
3398
3399	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
3400	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
3401
3402	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
3403
3404	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
3405	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
3406
3407	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
3408
3409	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
3410	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
3411
3412	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
3413
3414	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
3415	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
3416
3417	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
3418
3419	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
3420	shl     $mod_constantd, $mod_constantd, #56               @ MODULO - mod_constant = 0xc2 shifted left by 56 bits
3421
3422	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
3423
3424	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
3425	ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
3426
3427	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
3428	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
3429
3430	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
3431	ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
3432	eor     $output_l3, $output_l3, $rk12_l                   @ AES block 4k+3 - round 12 low
3433#ifdef __AARCH64EB__
3434	rev     $output_l3, $output_l3
3435#endif
3436	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
3437	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
3438
3439	aese    $ctr0b, $rk11                                     @ AES block 4k+4 - round 11
3440	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
3441
3442	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
3443	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
3444
3445	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
3446	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
3447
3448	aese    $ctr1b, $rk11                                     @ AES block 4k+5 - round 11
3449	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+7 - load ciphertext
3450	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
3451
3452	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
3453	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
3454
3455	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
3456	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
3457
3458	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL - flags are consumed by the b.lt at the end of the loop, nothing in between sets flags
3459
3460	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
3461	eor     $output_h3, $output_h3, $rk12_h                   @ AES block 4k+3 - round 12 high
3462#ifdef __AARCH64EB__
3463	rev     $output_h3, $output_h3
3464#endif
3465	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 4k+5 - result
3466
3467	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
3468	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
3469
3470	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
3471
3472	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
3473	mov     $output_l1, $ctr1.d[0]                            @ AES block 4k+5 - mov low
3474
3475	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
3476	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
3477	rev64   $res1b, $res1b                                    @ GHASH block 4k+5
3478
3479	aese    $ctr2b, $rk11                                     @ AES block 4k+6 - round 11
3480	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
3481
3482	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
3483	mov     $output_h1, $ctr1.d[1]                            @ AES block 4k+5 - mov high
3484
3485	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
3486	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
3487	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
3488
3489	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 4k+6 - result
3490	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
3491	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
3492
3493	eor     $output_l0, $output_l0, $rk12_l                   @ AES block 4k+4 - round 12 low
3494#ifdef __AARCH64EB__
3495	rev     $output_l0, $output_l0
3496#endif
3497	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
3498	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
3499
3500	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
3501	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
3502	eor     $output_l1, $output_l1, $rk12_l                   @ AES block 4k+5 - round 12 low
3503#ifdef __AARCH64EB__
3504	rev     $output_l1, $output_l1
3505#endif
3506	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
3507	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
3508	eor     $output_h1, $output_h1, $rk12_h                   @ AES block 4k+5 - round 12 high
3509#ifdef __AARCH64EB__
3510	rev     $output_h1, $output_h1
3511#endif
3512	eor     $output_h0, $output_h0, $rk12_h                   @ AES block 4k+4 - round 12 high
3513#ifdef __AARCH64EB__
3514	rev     $output_h0, $output_h0
3515#endif
3516	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
3517	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
3518
3519	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
3520	rev64   $res0b, $res0b                                    @ GHASH block 4k+4
3521	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
3522
3523	aese    $ctr3b, $rk11                                     @ AES block 4k+7 - round 11
3524	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 4k+5 - store result
3525	b.lt    .L192_dec_main_loop                               @ LOOP CONTROL - repeat while input_ptr < main_end_input_ptr
3526
3527	.L192_dec_prepretail:                                     @ PREPRETAIL - GHASH blocks 4k..4k+3, store blocks 4k+2 and 4k+3, run AES rounds 0-11 on counter blocks 4k+4..4k+7, no ciphertext loads
3528	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
3529	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
3530	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
3531
3532	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
3533	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
3534
3535	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
3536	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
3537
3538	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
3539	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
3540
3541	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
3542	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
3543
3544	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
3545	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
3546
3547	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
3548	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
3549	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
3550
3551	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
3552	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
3553
3554	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
3555	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
3556	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
3557
3558	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
3559	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
3560	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
3561
3562	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
3563	eor     $output_h3, $output_h3, $rk12_h                   @ AES block 4k+3 - round 12 high
3564#ifdef __AARCH64EB__
3565	rev     $output_h3, $output_h3
3566#endif
3567	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
3568
3569	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
3570	eor     $output_l2, $output_l2, $rk12_l                   @ AES block 4k+2 - round 12 low
3571#ifdef __AARCH64EB__
3572	rev     $output_l2, $output_l2
3573#endif
3574	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
3575	eor     $output_h2, $output_h2, $rk12_h                   @ AES block 4k+2 - round 12 high
3576#ifdef __AARCH64EB__
3577	rev     $output_h2, $output_h2
3578#endif
3579	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
3580
3581	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
3582	eor     $output_l3, $output_l3, $rk12_l                   @ AES block 4k+3 - round 12 low
3583#ifdef __AARCH64EB__
3584	rev     $output_l3, $output_l3
3585#endif
3586	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
3587
3588	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
3589	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
3590
3591	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
3592	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
3593
3594	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
3595	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
3596
3597	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
3598	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
3599
3600	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
3601
3602	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
3603	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
3604
3605	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
3606
3607	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
3608	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
3609
3610	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
3611
3612	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
3613
3614	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
3615	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
3616
3617	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
3618	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
3619
3620	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
3621
3622	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
3623	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
3624
3625	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
3626
3627	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
3628	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
3629
3630	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
3631
3632	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
3633	movi    $mod_constant.8b, #0xc2                           @ MODULO - set each of the 8 low bytes of mod_constant to 0xc2
3634
3635	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
3636
3637	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
3638
3639	shl     $mod_constantd, $mod_constantd, #56               @ MODULO - mod_constant = 0xc2 shifted left by 56 bits
3640	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
3641
3642	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
3643	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
3644
3645	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
3646
3647	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
3648	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
3649
3650	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
3651
3652	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
3653	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
3654
3655	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
3656
3657	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
3658	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
3659
3660	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
3661
3662	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
3663	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
3664
3665	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
3666
3667	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
3668	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
3669
3670	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
3671
3672	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
3673
3674	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
3675
3676	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
3677
3678	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
3679	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
3680
3681	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
3682
3683	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
3684
3685	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
3686
3687	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
3688	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
3689
3690	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
3691
3692	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
3693
3694	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
3695
3696	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
3697
3698	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
3699
3700	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
3701
3702	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
3703
3704	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
3705
3706	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
3707	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
3708
3709	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
3710
3711	aese    $ctr0b, $rk11                                     @ AES block 4k+4 - round 11 (no aesmc after this round)
3712	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
3713
3714	aese    $ctr2b, $rk11                                     @ AES block 4k+6 - round 11 (no aesmc after this round)
3715
3716	aese    $ctr1b, $rk11                                     @ AES block 4k+5 - round 11 (no aesmc after this round)
3717
3718	aese    $ctr3b, $rk11                                     @ AES block 4k+7 - round 11 (no aesmc after this round)
3719
3720	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
3721	.L192_dec_tail:                                           @ TAIL - decrypt the first remaining block, then dispatch on the number of bytes left
3722
3723	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
3724	ld1     { $res1b}, [$input_ptr], #16                      @ AES block 4k+4 - load ciphertext
3725
3726	eor     $ctr0b, $res1b, $ctr0b                            @ AES block 4k+4 - result
3727
3728	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
3729
3730	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
3731
3732	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
3733
3734	cmp     $main_end_input_ptr, #48                          @ more than 3 blocks (48 bytes) left? flags consumed by the b.gt below
3735
3736	eor     $output_h0, $output_h0, $rk12_h                   @ AES block 4k+4 - round 12 high
3737#ifdef __AARCH64EB__
3738	rev     $output_h0, $output_h0
3739#endif
3740	eor     $output_l0, $output_l0, $rk12_l                   @ AES block 4k+4 - round 12 low
3741#ifdef __AARCH64EB__
3742	rev     $output_l0, $output_l0
3743#endif
3744	b.gt    .L192_dec_blocks_more_than_3                      @ more than 48 bytes left
3745
3746	movi    $acc_l.8b, #0                                     @ clear acc_l
3747	movi    $acc_h.8b, #0                                     @ clear acc_h
3748
3749	mov     $ctr3b, $ctr2b                                    @ ctr3 = ctr2
3750	mov     $ctr2b, $ctr1b                                    @ ctr2 = ctr1
3751	sub     $rctr32w, $rctr32w, #1                            @ step the counter back by one - presumably one prepared counter block goes unused, TODO confirm
3752
3753	movi    $acc_m.8b, #0                                     @ clear acc_m
3754	cmp     $main_end_input_ptr, #32                          @ more than 2 blocks (32 bytes) left?
3755	b.gt    .L192_dec_blocks_more_than_2                      @ more than 32 bytes left
3756
3757	mov     $ctr3b, $ctr1b                                    @ ctr3 = ctr1
3758	cmp     $main_end_input_ptr, #16                          @ more than 1 block (16 bytes) left?
3759	sub     $rctr32w, $rctr32w, #1                            @ step the counter back by one more (sub does not alter the flags from cmp)
3760
3761	b.gt    .L192_dec_blocks_more_than_1                      @ more than 16 bytes left
3762
3763	sub     $rctr32w, $rctr32w, #1                            @ step the counter back by one more
3764	b       .L192_dec_blocks_less_than_1                      @ 16 bytes or fewer left
3765	.L192_dec_blocks_more_than_3:                             @ blocks left >  3
3766	rev64   $res0b, $res1b                                    @ GHASH final-3 block
3767	ld1     { $res1b}, [$input_ptr], #16                      @ AES final-2 block - load ciphertext
3768
3769	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-3 block  - store result
3770
3771	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
3772
3773	eor     $ctr0b, $res1b, $ctr1b                            @ AES final-2 block - result
3774
3775	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH final-3 block - low
3776	mov     $output_l0, $ctr0.d[0]                            @ AES final-2 block - mov low
3777	mov     $rk4d, $res0.d[1]                                 @ GHASH final-3 block - mid
3778
3779	mov     $output_h0, $ctr0.d[1]                            @ AES final-2 block - mov high
3780
3781	mov     $acc_md, $h34k.d[1]                               @ GHASH final-3 block - mid
3782	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-3 block - mid
3783
3784	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH final-3 block - high
3785
3786	eor     $output_l0, $output_l0, $rk12_l                   @ AES final-2 block - round 12 low
3787#ifdef __AARCH64EB__
3788	rev     $output_l0, $output_l0
3789#endif
3790	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
3791
3792	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                    @ GHASH final-3 block - mid
3793	eor     $output_h0, $output_h0, $rk12_h                   @ AES final-2 block - round 12 high
3794#ifdef __AARCH64EB__
3795	rev     $output_h0, $output_h0
3796#endif
3797	.L192_dec_blocks_more_than_2:                             @ blocks left >  2
3798
3799	rev64   $res0b, $res1b                                    @ GHASH final-2 block
3800	ld1     { $res1b}, [$input_ptr], #16                      @ AES final-1 block - load ciphertext
3801
3802	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
3803
3804	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
3805
3806	eor     $ctr0b, $res1b, $ctr2b                            @ AES final-1 block - result
3807
3808	mov     $rk4d, $res0.d[1]                                 @ GHASH final-2 block - mid
3809
3810	pmull   $rk3q1, $res0.1d, $h3.1d                          @ GHASH final-2 block - low
3811
3812	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-2 block  - store result
3813
3814	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-2 block - mid
3815	mov     $output_h0, $ctr0.d[1]                            @ AES final-1 block - mov high
3816
3817	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-2 block - low
3818	mov     $output_l0, $ctr0.d[0]                            @ AES final-1 block - mov low
3819
3820	pmull2  $rk2q1, $res0.2d, $h3.2d                          @ GHASH final-2 block - high
3821
3822	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                      @ GHASH final-2 block - mid
3823
3824	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-2 block - high
3825	eor     $output_h0, $output_h0, $rk12_h                   @ AES final-1 block - round 12 high
3826#ifdef __AARCH64EB__
3827	rev     $output_h0, $output_h0
3828#endif
3829	eor     $output_l0, $output_l0, $rk12_l                   @ AES final-1 block - round 12 low
3830#ifdef __AARCH64EB__
3831	rev     $output_l0, $output_l0
3832#endif
3833	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-2 block - mid
3834	.L192_dec_blocks_more_than_1:                             @ blocks left >  1
3835
3836	rev64   $res0b, $res1b                                    @ GHASH final-1 block
3837
3838	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
3839	ld1     { $res1b}, [$input_ptr], #16                      @ AES final block - load ciphertext
3840
3841	mov     $rk4d, $res0.d[1]                                 @ GHASH final-1 block - mid
3842
3843	pmull2  $rk2q1, $res0.2d, $h2.2d                          @ GHASH final-1 block - high
3844
3845	eor     $ctr0b, $res1b, $ctr3b                            @ AES final block - result
3846	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-1 block  - store result
3847
3848	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-1 block - mid
3849
3850	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-1 block - high
3851
3852	pmull   $rk3q1, $res0.1d, $h2.1d                          @ GHASH final-1 block - low
3853	mov     $output_h0, $ctr0.d[1]                            @ AES final block - mov high
3854
3855	ins     $rk4v.d[1], $rk4v.d[0]                            @ GHASH final-1 block - mid
3856	mov     $output_l0, $ctr0.d[0]                            @ AES final block - mov low
3857
3858	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                      @ GHASH final-1 block - mid
3859
3860	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
3861	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-1 block - low
3862	eor     $output_h0, $output_h0, $rk12_h                   @ AES final block - round 12 high
3863#ifdef __AARCH64EB__
3864	rev     $output_h0, $output_h0
3865#endif
3866	eor     $output_l0, $output_l0, $rk12_l                   @ AES final block - round 12 low
3867#ifdef __AARCH64EB__
3868	rev     $output_l0, $output_l0
3869#endif
3870	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-1 block - mid
3871	.L192_dec_blocks_less_than_1:                             @ blocks left <= 1
3872
3873	mvn     $rk12_l, xzr                                      @ rk12_l = 0xffffffffffffffff
3874	ldp     $end_input_ptr, $main_end_input_ptr, [$output_ptr]  @ load existing bytes we need to not overwrite
3875	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
3876
3877	sub     $bit_length, $bit_length, #128                    @ bit_length -= 128
3878
3879	neg     $bit_length, $bit_length                          @ bit_length = 128 - #bits in input (in range [1,128])
3880
3881	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
3882	mvn     $rk12_h, xzr                                      @ rk12_h = 0xffffffffffffffff
3883
3884	lsr     $rk12_h, $rk12_h, $bit_length                     @ rk12_h is mask for top 64b of last block
3885	cmp     $bit_length, #64
3886
3887	csel    $ctr32x, $rk12_l, $rk12_h, lt
3888	csel    $ctr96_b64x, $rk12_h, xzr, lt
3889
3890	fmov    $ctr0d, $ctr32x                                   @ ctr0b is mask for last block
3891	and     $output_l0, $output_l0, $ctr32x
3892	bic     $end_input_ptr, $end_input_ptr, $ctr32x           @ mask out low existing bytes
3893
3894	orr     $output_l0, $output_l0, $end_input_ptr
3895	mov     $ctr0.d[1], $ctr96_b64x
3896#ifndef __AARCH64EB__
3897	rev     $ctr32w, $rctr32w
3898#else
3899	mov     $ctr32w, $rctr32w
3900#endif
3901
3902	and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
3903	str     $ctr32w, [$counter, #12]                          @ store the updated counter
3904
3905	rev64   $res0b, $res1b                                    @ GHASH final block
3906
3907	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
3908	bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
3909
3910	and     $output_h0, $output_h0, $ctr96_b64x
3911
3912	pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
3913	mov     $t0d, $res0.d[1]                                  @ GHASH final block - mid
3914
3915	pmull   $rk3q1, $res0.1d, $h1.1d                          @ GHASH final block - low
3916
3917	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
3918
3919	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final block - high
3920
3921	pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
3922
3923	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final block - low
3924
3925	eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
3926	movi    $mod_constant.8b, #0xc2
3927
3928	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
3929
3930	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
3931
3932	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
3933
3934	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
3935	orr     $output_h0, $output_h0, $main_end_input_ptr
3936	stp     $output_l0, $output_h0, [$output_ptr]
3937
3938	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
3939
3940	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
3941
3942	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
3943
3944	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
3945
3946	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
3947
3948	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
3949
3950	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
3951	ext     $acc_lb, $acc_lb, $acc_lb, #8
3952	rev64   $acc_lb, $acc_lb
3953	mov     x0, $len
3954	st1     { $acc_l.16b }, [$current_tag]
3955
3956	ldp     x21, x22, [sp, #16]
3957	ldp     x23, x24, [sp, #32]
3958	ldp     d8, d9, [sp, #48]
3959	ldp     d10, d11, [sp, #64]
3960	ldp     d12, d13, [sp, #80]
3961	ldp     d14, d15, [sp, #96]
3962	ldp     x19, x20, [sp], #112
3963	ret
3964
3965.L192_dec_ret:
3966	mov w0, #0x0
3967	ret
3968.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
3969___
3970}
3971
3972{
# Register allocation for the AES-256 GCM code emitted below.  Every scalar
# holds the textual name of an AArch64 register that is interpolated into the
# assembly templates; aliases that carry the same number name the same
# physical register.

# General-purpose registers: end-of-input pointers and per-block 64-bit halves.
my ($end_input_ptr,$main_end_input_ptr)=("x4","x5");
my ($input_l0,$input_h0)=("x6","x7");
my ($output_l0,$output_h0)=("x6","x7");          # same registers as input_l0/input_h0
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map { "x$_" } 19..24;
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map { "x$_" } 19..24;

# Counter bookkeeping, the two 64-bit halves of round key 14 and the byte length.
my ($ctr32w,$ctr32x)=("w9","x9");
my ($ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map { "x$_" } 10..15;
my ($ctr96_t32w,$rctr32w)=("w11","w12");

# v0-v3 hold the counter blocks, v4-v7 the result blocks; each is available
# under its .16b, plain vector, d and (results only) q spelling.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map { "v$_.16b" } 0..7;
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map { "v$_" } 0..7;
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map { "d$_" } 0..7;
my ($res0q,$res1q,$res2q,$res3q)=map { "q$_" } 4..7;

# v9-v11: high / mid / low accumulators used by the GHASH steps.
my ($acc_hb,$acc_mb,$acc_lb)=map { "v$_.16b" } 9..11;
my ($acc_h,$acc_m,$acc_l)=map { "v$_" } 9..11;
my ($acc_hd,$acc_md,$acc_ld)=map { "d$_" } 9..11;

# v12-v15: h1..h4; v16/v17: the combined h12k and h34k values.
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map { "v$_" } 12..17;
my ($h1q,$h2q,$h3q,$h4q)=map { "q$_" } 12..15;
my ($h1b,$h2b,$h3b,$h4b)=map { "v$_.16b" } 12..15;

# Scratch registers t0..t9.  They are only aliases of v4, v5, v6 and v8, so
# several of them overlap each other and the result registers.
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9)=map { "v$_" } (8,4,8,4,4,5,8,5,6,4);
my ($t0d,$t1d,$t2d,$t3d,$t4d,$t5d,$t6d,$t7d,$t8d,$t9d)=map { "d$_" } (8,4,8,4,4,5,8,5,6,4);

# Temporaries named ctr_t0..ctr_t3; they live in v4-v7 like res0..res3.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map { "v$_" } 4..7;
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map { "d$_" } 4..7;
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map { "v$_.16b" } 4..7;

# Registers used by the MODULO reduction steps (mod_constant shares v8 with t0).
my ($mod_constant,$mod_constantd,$mod_t)=("v8","d8","v7");

# Round keys rk0..rk13 occupy v18-v31 (.16b, .4s and q spellings).
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map { "v$_.16b" } 18..31;
my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map { "v$_.4s" } 18..31;
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map { "q$_" } 18..31;

# Extra spellings of v20, v21 and v22, i.e. the registers also named rk2, rk3 and rk4.
my ($rk2q1,$rk3q1)=("v20.1q","v21.1q");
my ($rk4v,$rk4d)=("v22","d22");
4031
4032#########################################################################################
4033# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
4034#                               size_t len,
4035#                               unsigned char *out,
4036#                               const void *key,
4037#                               unsigned char ivec[16],
4038#                               u64 *Xi);
4039#
4040$code.=<<___;
4041.global aes_gcm_enc_256_kernel
4042.type   aes_gcm_enc_256_kernel,%function
4043.align  4
4044aes_gcm_enc_256_kernel:
4045	cbz     x1, .L256_enc_ret
4046	stp     x19, x20, [sp, #-112]!
4047	mov     x16, x4
4048	mov     x8, x5
4049	stp     x21, x22, [sp, #16]
4050	stp     x23, x24, [sp, #32]
4051	stp     d8, d9, [sp, #48]
4052	stp     d10, d11, [sp, #64]
4053	stp     d12, d13, [sp, #80]
4054	stp     d14, d15, [sp, #96]
4055
4056	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
4057	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
4058	mov     $len, $main_end_input_ptr
4059	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
4060#ifdef __AARCH64EB__
4061	rev     $ctr96_b64x, $ctr96_b64x
4062	rev     $ctr96_t32x, $ctr96_t32x
4063#endif
4064	ldp     $rk14_l, $rk14_h, [$cc, #224]                     @ load rk14
4065#ifdef __AARCH64EB__
4066	ror     $rk14_l, $rk14_l, #32
4067	ror     $rk14_h, $rk14_h, #32
4068#endif
4069	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
4070	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
4071
4072	ld1     {$rk0s}, [$cc], #16                               @ load rk0
4073	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4074
4075	ld1     {$rk1s}, [$cc], #16                               @ load rk1
4076	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4077
4078	lsr     $rctr32x, $ctr96_t32x, #32
4079	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
4080	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
4081
4082	rev     $rctr32w, $rctr32w                                @ rev_ctr32
4083	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
4084	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
4085
4086	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
4087	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
4088
4089	rev     $ctr32w, $rctr32w                                 @ CTR block 1
4090	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
4091
4092	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
4093	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
4094	ld1     {$rk2s}, [$cc], #16                               @ load rk2
4095
4096	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
4097	rev     $ctr32w, $rctr32w                                 @ CTR block 2
4098	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
4099
4100	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
4101	ld1     {$rk3s}, [$cc], #16                               @ load rk3
4102
4103	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
4104	rev     $ctr32w, $rctr32w                                 @ CTR block 3
4105
4106	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
4107	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
4108
4109	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
4110
4111	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
4112	ld1     {$rk4s}, [$cc], #16                               @ load rk4
4113
4114	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
4115	ld1     {$rk5s}, [$cc], #16                               @ load rk5
4116
4117	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
4118	ld1     {$rk6s}, [$cc], #16                               @ load rk6
4119
4120	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
4121	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
4122#ifndef __AARCH64EB__
4123	ext     $h3b, $h3b, $h3b, #8
4124#endif
4125	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
4126	ld1     {$rk7s}, [$cc], #16                               @ load rk7
4127
4128	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
4129	ld1     {$rk8s}, [$cc], #16                               @ load rk8
4130
4131	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
4132	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
4133#ifndef __AARCH64EB__
4134	ext     $h2b, $h2b, $h2b, #8
4135#endif
4136	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
4137	ld1     {$rk9s}, [$cc], #16                               @ load rk9
4138
4139	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
4140	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
4141#ifndef __AARCH64EB__
4142	ext     $h4b, $h4b, $h4b, #8
4143#endif
4144	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
4145	ld1     {$rk10s}, [$cc], #16                              @ load rk10
4146
4147	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
4148	ld1     {$rk11s}, [$cc], #16                              @ load rk11
4149
4150	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
4151	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
4152
4153	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
4154
4155	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
4156	ld1     { $acc_lb}, [$current_tag]
4157	ext     $acc_lb, $acc_lb, $acc_lb, #8
4158	rev64   $acc_lb, $acc_lb
4159
4160	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
4161
4162	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
4163
4164	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
4165
4166	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
4167
4168	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
4169
4170	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
4171
4172	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
4173
4174	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
4175
4176	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
4177	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
4178
4179	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
4180	ld1     {$rk12s}, [$cc], #16                              @ load rk12
4181
4182	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
4183	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
4184#ifndef __AARCH64EB__
4185	ext     $h1b, $h1b, $h1b, #8
4186#endif
4187	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
4188	ld1     {$rk13s}, [$cc], #16                              @ load rk13
4189
4190	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
4191	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
4192
4193	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
4194
4195	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
4196
4197	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
4198	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
4199
4200	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
4201
4202	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
4203
4204	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
4205
4206	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 9
4207
4208	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 9
4209
4210	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
4211
4212	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 10
4213
4214	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 9
4215
4216	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 9
4217
4218	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 10
4219
4220	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 10
4221
4222	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 11
4223
4224	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 11
4225
4226	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 10
4227
4228	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 12
4229
4230	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 12
4231
4232	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 11
4233	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
4234
4235	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 11
4236
4237	aese    $ctr2b, $rk13                                     @ AES block 2 - round 13
4238	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
4239
4240	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 12
4241
4242	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 12
4243
4244	aese    $ctr1b, $rk13                                     @ AES block 1 - round 13
4245
4246	aese    $ctr0b, $rk13                                     @ AES block 0 - round 13
4247
4248	aese    $ctr3b, $rk13                                     @ AES block 3 - round 13
4249	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
4250	b.ge    .L256_enc_tail                                    @ handle tail
4251
4252	ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 1 - load plaintext
4253#ifdef __AARCH64EB__
4254	rev     $input_l1, $input_l1
4255	rev     $input_h1, $input_h1
4256#endif
4257	rev     $ctr32w, $rctr32w                                 @ CTR block 4
4258	ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 0 - load plaintext
4259#ifdef __AARCH64EB__
4260	rev     $input_l0, $input_l0
4261	rev     $input_h0, $input_h0
4262#endif
4263	ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 3 - load plaintext
4264#ifdef __AARCH64EB__
4265	rev     $input_l3, $input_l3
4266	rev     $input_h3, $input_h3
4267#endif
4268	ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 2 - load plaintext
4269#ifdef __AARCH64EB__
4270	rev     $input_l2, $input_l2
4271	rev     $input_h2, $input_h2
4272#endif
4273	add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
4274
4275	eor     $input_l1, $input_l1, $rk14_l                     @ AES block 1 - round 14 low
4276	eor     $input_h1, $input_h1, $rk14_h                     @ AES block 1 - round 14 high
4277
4278	fmov    $ctr_t1d, $input_l1                               @ AES block 1 - mov low
4279	eor     $input_l0, $input_l0, $rk14_l                     @ AES block 0 - round 14 low
4280
4281	eor     $input_h0, $input_h0, $rk14_h                     @ AES block 0 - round 14 high
4282	eor     $input_h3, $input_h3, $rk14_h                     @ AES block 3 - round 14 high
4283	fmov    $ctr_t0d, $input_l0                               @ AES block 0 - mov low
4284
4285	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
4286	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 0 - mov high
4287	eor     $input_l3, $input_l3, $rk14_l                     @ AES block 3 - round 14 low
4288
4289	eor     $input_l2, $input_l2, $rk14_l                     @ AES block 2 - round 14 low
4290	fmov    $ctr_t1.d[1], $input_h1                           @ AES block 1 - mov high
4291
4292	fmov    $ctr_t2d, $input_l2                               @ AES block 2 - mov low
4293	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
4294
4295	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
4296	fmov    $ctr_t3d, $input_l3                               @ AES block 3 - mov low
4297	eor     $input_h2, $input_h2, $rk14_h                     @ AES block 2 - round 14 high
4298
4299	fmov    $ctr_t2.d[1], $input_h2                           @ AES block 2 - mov high
4300
4301	eor     $res0b, $ctr_t0b, $ctr0b                          @ AES block 0 - result
4302	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
4303
4304	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
4305	rev     $ctr32w, $rctr32w                                 @ CTR block 5
4306	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
4307
4308	eor     $res1b, $ctr_t1b, $ctr1b                          @ AES block 1 - result
4309	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
4310	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
4311
4312	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
4313	rev     $ctr32w, $rctr32w                                 @ CTR block 6
4314	st1     { $res0b}, [$output_ptr], #16                     @ AES block 0 - store result
4315
4316	fmov    $ctr_t3.d[1], $input_h3                           @ AES block 3 - mov high
4317	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
4318	eor     $res2b, $ctr_t2b, $ctr2b                          @ AES block 2 - result
4319
4320	st1     { $res1b}, [$output_ptr], #16                     @ AES block 1 - store result
4321
4322	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
4323	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 6
4324
4325	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 6
4326	st1     { $res2b}, [$output_ptr], #16                     @ AES block 2 - store result
4327	rev     $ctr32w, $rctr32w                                 @ CTR block 7
4328
4329	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 7
4330
4331	eor     $res3b, $ctr_t3b, $ctr3b                          @ AES block 3 - result
4332	st1     { $res3b}, [$output_ptr], #16                     @ AES block 3 - store result
4333	b.ge    L256_enc_prepretail                               @ do prepretail
4334
4335	.L256_enc_main_loop:                                      @ main loop start
4336	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
4337	rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
4338
4339	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
4340	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+3
4341
4342	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
4343	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
4344
4345	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
4346	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+3
4347
4348	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
4349	ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 4k+7 - load plaintext
4350#ifdef __AARCH64EB__
4351	rev     $input_l3, $input_l3
4352	rev     $input_h3, $input_h3
4353#endif
4354	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
4355	ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 4k+6 - load plaintext
4356#ifdef __AARCH64EB__
4357	rev     $input_l2, $input_l2
4358	rev     $input_h2, $input_h2
4359#endif
4360	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
4361	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
4362
4363	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
4364
4365	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
4366	eor     $input_l3, $input_l3, $rk14_l                     @ AES block 4k+7 - round 14 low
4367
4368	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
4369	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
4370
4371	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
4372	eor     $input_h2, $input_h2, $rk14_h                     @ AES block 4k+6 - round 14 high
4373	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
4374
4375	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
4376	rev64   $res1b, $res1b                                    @ GHASH block 4k+1 (t0 and t1 free)
4377
4378	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
4379
4380	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
4381	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
4382
4383	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
4384
4385	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
4386	rev64   $res3b, $res3b                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4387
4388	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
4389
4390	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
4391	rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
4392
4393	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
4394
4395	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
4396	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
4397
4398	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
4399
4400	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
4401	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
4402
4403	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
4404
4405	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
4406	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
4407
4408	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
4409	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
4410
4411	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
4412
4413	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
4414	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
4415
4416	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
4417
4418	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
4419
4420	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
4421
4422	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
4423	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
4424
4425	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
4426
4427	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
4428
4429	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
4430
4431	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
4432	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
4433
4434	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
4435
4436	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
4437
4438	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
4439
4440	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
4441	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
4442
4443	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
4444	ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 4k+5 - load plaintext
4445#ifdef __AARCH64EB__
4446	rev     $input_l1, $input_l1
4447	rev     $input_h1, $input_h1
4448#endif
4449	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
4450	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
4451
4452	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
4453	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
4454
4455	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
4456
4457	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
4458	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
4459
4460	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
4461	eor     $input_l1, $input_l1, $rk14_l                     @ AES block 4k+5 - round 14 low
4462
4463	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
4464	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
4465
4466	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
4467	eor     $input_l2, $input_l2, $rk14_l                     @ AES block 4k+6 - round 14 low
4468
4469	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
4470	movi    $mod_constant.8b, #0xc2
4471
4472	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
4473	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
4474	fmov    $ctr_t1d, $input_l1                               @ AES block 4k+5 - mov low
4475
4476	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
4477	ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 4k+4 - load plaintext
4478#ifdef __AARCH64EB__
4479	rev     $input_l0, $input_l0
4480	rev     $input_h0, $input_h0
4481#endif
4482	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
4483	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
4484
4485	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
4486	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
4487
4488	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
4489
4490	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
4491	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
4492
4493	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
4494	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+3
4495
4496	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 11
4497	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
4498
4499	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
4500	add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
4501
4502	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
4503	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
4504	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
4505
4506	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
4507	eor     $input_l0, $input_l0, $rk14_l                     @ AES block 4k+4 - round 14 low
4508
4509	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
4510	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
4511
4512	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
4513	eor     $input_h0, $input_h0, $rk14_h                     @ AES block 4k+4 - round 14 high
4514
4515	fmov    $ctr_t0d, $input_l0                               @ AES block 4k+4 - mov low
4516	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
4517	eor     $mod_t.16b, $acc_hb, $mod_t.16b                   @ MODULO - fold into mid
4518
4519	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 12
4520	eor     $input_h1, $input_h1, $rk14_h                     @ AES block 4k+5 - round 14 high
4521
4522	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
4523	eor     $input_h3, $input_h3, $rk14_h                     @ AES block 4k+7 - round 14 high
4524
4525	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
4526	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
4527
4528	aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
4529	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 4k+4 - mov high
4530	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
4531
4532	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
4533	fmov    $ctr_t3d, $input_l3                               @ AES block 4k+7 - mov low
4534
4535	aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
4536	fmov    $ctr_t1.d[1], $input_h1                           @ AES block 4k+5 - mov high
4537
4538	fmov    $ctr_t2d, $input_l2                               @ AES block 4k+6 - mov low
4539	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
4540
4541	fmov    $ctr_t2.d[1], $input_h2                           @ AES block 4k+6 - mov high
4542
4543	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d            @ MODULO - mid 64b align with low
4544	eor     $res0b, $ctr_t0b, $ctr0b                          @ AES block 4k+4 - result
4545	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
4546
4547	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
4548	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
4549	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
4550
4551	eor     $res1b, $ctr_t1b, $ctr1b                          @ AES block 4k+5 - result
4552	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
4553	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
4554
4555	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 12
4556	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
4557
4558	aese    $ctr2b, $rk13                                     @ AES block 4k+6 - round 13
4559	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
4560	st1     { $res0b}, [$output_ptr], #16                     @ AES block 4k+4 - store result
4561
4562	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
4563	eor     $acc_lb, $acc_lb, $acc_hb                         @ MODULO - fold into low
4564	fmov    $ctr_t3.d[1], $input_h3                           @ AES block 4k+7 - mov high
4565
4566	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
4567	st1     { $res1b}, [$output_ptr], #16                     @ AES block 4k+5 - store result
4568	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
4569
4570	aese    $ctr3b, $rk13                                     @ AES block 4k+7 - round 13
4571	eor     $res2b, $ctr_t2b, $ctr2b                          @ AES block 4k+6 - result
4572	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+10
4573
4574	st1     { $res2b}, [$output_ptr], #16                     @ AES block 4k+6 - store result
4575	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+10
4576	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+11
4577
4578	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
4579	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+11
4580
4581	eor     $res3b, $ctr_t3b, $ctr3b                          @ AES block 4k+7 - result
4582	st1     { $res3b}, [$output_ptr], #16                     @ AES block 4k+7 - store result
4583	b.lt    L256_enc_main_loop
4584
4585	.L256_enc_prepretail:                                     @ PREPRETAIL
4586	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
4587	rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
4588
4589	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
4590	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+3
4591
4592	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
4593	rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
4594
4595	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+3
4596	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
4597
4598	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
4599
4600	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
4601
4602	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
4603	rev64   $res1b, $res1b                                    @ GHASH block 4k+1 (t0 and t1 free)
4604
4605	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
4606
4607	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
4608	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
4609
4610	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
4611
4612	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
4613	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
4614
4615	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
4616
4617	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
4618
4619	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
4620	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
4621
4622	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
4623
4624	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
4625
4626	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
4627
4628	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
4629
4630	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
4631
4632	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
4633
4634	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
4635
4636	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
4637	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
4638
4639	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
4640	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
4641
4642	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
4643
4644	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
4645	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
4646
4647	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
4648	rev64   $res3b, $res3b                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4649
4650	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
4651
4652	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
4653	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
4654	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+3
4655
4656	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
4657
4658	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
4659
4660	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
4661	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
4662
4663	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
4664
4665	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
4666	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
4667
4668	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
4669
4670	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
4671	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
4672
4673	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
4674
4675	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
4676
4677	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
4678
4679	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
4680
4681	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
4682
4683	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
4684	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
4685
4686	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
4687
4688	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
4689
4690	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
4691
4692	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
4693	movi    $mod_constant.8b, #0xc2
4694
4695	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
4696
4697	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
4698	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
4699
4700	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
4701
4702	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
4703	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
4704
4705	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
4706	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
4707
4708	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
4709
4710	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
4711
4712	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
4713
4714	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
4715	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
4716
4717	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
4718
4719	eor     $acc_mb, $acc_mb, $acc_hb                         @ karatsuba tidy up
4720
4721	pmull   $t1.1q, $acc_h.1d, $mod_constant.1d
4722	ext     $acc_hb, $acc_hb, $acc_hb, #8
4723
4724	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
4725
4726	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
4727	eor     $acc_mb, $acc_mb, $acc_lb
4728
4729	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
4730
4731	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
4732
4733	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
4734
4735	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
4736	eor     $acc_mb, $acc_mb, $t1.16b
4737
4738	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
4739
4740	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
4741
4742	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
4743
4744	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 11
4745	eor     $acc_mb, $acc_mb, $acc_hb
4746
4747	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
4748
4749	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
4750
4751	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 12
4752
4753	pmull   $t1.1q, $acc_m.1d, $mod_constant.1d
4754
4755	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
4756	ext     $acc_mb, $acc_mb, $acc_mb, #8
4757
4758	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 12
4759
4760	aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
4761	eor     $acc_lb, $acc_lb, $t1.16b
4762
4763	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
4764
4765	aese    $ctr3b, $rk13                                     @ AES block 4k+7 - round 13
4766
4767	aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
4768
4769	aese    $ctr2b, $rk13                                     @ AES block 4k+6 - round 13
4770	eor     $acc_lb, $acc_lb, $acc_mb
	.L256_enc_tail:                                           @ TAIL
	@ Tail dispatch: the first remaining block is always encrypted here with the
	@ keystream in ctr0. The compares against 48/32/16 bytes then pick the entry
	@ point for the blocks that follow. The later sections use ctr1 for the
	@ final-2 block, ctr2 for the final-1 block and ctr3 for the final block, so
	@ when fewer blocks remain the keystream registers are shifted up to match.

	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev     $input_l0, $input_l0
	rev     $input_h0, $input_h0
#endif
	eor     $input_l0, $input_l0, $rk14_l                     @ AES block 4k+4 - round 14 low
	eor     $input_h0, $input_h0, $rk14_h                     @ AES block 4k+4 - round 14 high

	cmp     $main_end_input_ptr, #48                          @ more than 48 bytes (3 blocks) left?
	fmov    $ctr_t0d, $input_l0                               @ AES block 4k+4 - mov low

	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 4k+4 - mov high

	eor     $res1b, $ctr_t0b, $ctr0b                          @ AES block 4k+4 - result
	b.gt    .L256_enc_blocks_more_than_3                      @ taken when more than 48 bytes remain

	cmp     $main_end_input_ptr, #32                          @ more than 32 bytes (2 blocks) left? (tested by the b.gt below)
	mov     $ctr3b, $ctr2b                                    @ at most 3 blocks left - final block takes the keystream from ctr2
	movi    $acc_l.8b, #0                                     @ clear GHASH low accumulator - the sections entered from here eor into it

	movi    $acc_h.8b, #0                                     @ clear GHASH high accumulator
	sub     $rctr32w, $rctr32w, #1                            @ step back the counter value that is written back to the counter block at the end

	mov     $ctr2b, $ctr1b                                    @ final-1 block takes the keystream from ctr1
	movi    $acc_m.8b, #0                                     @ clear GHASH mid accumulator
	b.gt    .L256_enc_blocks_more_than_2                      @ taken when more than 32 bytes remain

	mov     $ctr3b, $ctr1b                                    @ at most 2 blocks left - final block takes the keystream from ctr1
	sub     $rctr32w, $rctr32w, #1                            @ step the written-back counter value back once more
	cmp     $main_end_input_ptr, #16                          @ more than 16 bytes (1 block) left?

	b.gt    .L256_enc_blocks_more_than_1                      @ taken when more than 16 bytes remain

	sub     $rctr32w, $rctr32w, #1                            @ at most 1 block left - step the written-back counter value back a third time
	b       .L256_enc_blocks_less_than_1                      @ the block already encrypted with ctr0 is the final (possibly partial) block
4810	.L256_enc_blocks_more_than_3:                            @ blocks left >  3
4811	st1     { $res1b}, [$output_ptr], #16                    @ AES final-3 block  - store result
4812
4813	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-2 block - load input low & high
4814#ifdef __AARCH64EB__
4815	rev     $input_l0, $input_l0
4816	rev     $input_h0, $input_h0
4817#endif
4818	rev64   $res0b, $res1b                                   @ GHASH final-3 block
4819
4820	eor     $input_l0, $input_l0, $rk14_l                    @ AES final-2 block - round 14 low
4821	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
4822
4823	eor     $input_h0, $input_h0, $rk14_h                    @ AES final-2 block - round 14 high
4824
4825	mov     $rk4d, $res0.d[1]                                @ GHASH final-3 block - mid
4826	fmov    $res1d, $input_l0                                @ AES final-2 block - mov low
4827
4828	fmov    $res1.d[1], $input_h0                            @ AES final-2 block - mov high
4829
4830	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-3 block - mid
4831	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
4832
4833	mov     $acc_md, $h34k.d[1]                              @ GHASH final-3 block - mid
4834
4835	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH final-3 block - low
4836
4837	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH final-3 block - high
4838
4839	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   @ GHASH final-3 block - mid
4840	eor     $res1b, $res1b, $ctr1b                           @ AES final-2 block - result
4841	.L256_enc_blocks_more_than_2:                            @ blocks left >  2
4842
4843	st1     { $res1b}, [$output_ptr], #16                    @ AES final-2 block - store result
4844
4845	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-1 block - load input low & high
4846#ifdef __AARCH64EB__
4847	rev     $input_l0, $input_l0
4848	rev     $input_h0, $input_h0
4849#endif
4850	rev64   $res0b, $res1b                                   @ GHASH final-2 block
4851
4852	eor     $input_l0, $input_l0, $rk14_l                    @ AES final-1 block - round 14 low
4853	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
4854
4855	fmov    $res1d, $input_l0                                @ AES final-1 block - mov low
4856	eor     $input_h0, $input_h0, $rk14_h                    @ AES final-1 block - round 14 high
4857
4858	fmov    $res1.d[1], $input_h0                            @ AES final-1 block - mov high
4859
4860	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
4861
4862	pmull2  $rk2q1, $res0.2d, $h3.2d                         @ GHASH final-2 block - high
4863	mov     $rk4d, $res0.d[1]                                @ GHASH final-2 block - mid
4864
4865	pmull   $rk3q1, $res0.1d, $h3.1d                         @ GHASH final-2 block - low
4866
4867	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-2 block - mid
4868
4869	eor     $res1b, $res1b, $ctr2b                           @ AES final-1 block - result
4870
4871	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-2 block - high
4872
4873	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     @ GHASH final-2 block - mid
4874
4875	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-2 block - low
4876
4877	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-2 block - mid
4878	.L256_enc_blocks_more_than_1:                            @ blocks left >  1
4879
4880	st1     { $res1b}, [$output_ptr], #16                    @ AES final-1 block - store result
4881
4882	rev64   $res0b, $res1b                                   @ GHASH final-1 block
4883
4884	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final block - load input low & high
4885#ifdef __AARCH64EB__
4886	rev     $input_l0, $input_l0
4887	rev     $input_h0, $input_h0
4888#endif
4889	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
4890
4891	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
4892
4893	eor     $input_l0, $input_l0, $rk14_l                    @ AES final block - round 14 low
4894	mov     $rk4d, $res0.d[1]                                @ GHASH final-1 block - mid
4895
4896	pmull2  $rk2q1, $res0.2d, $h2.2d                         @ GHASH final-1 block - high
4897	eor     $input_h0, $input_h0, $rk14_h                    @ AES final block - round 14 high
4898
4899	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-1 block - mid
4900
4901	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-1 block - high
4902
4903	ins     $rk4v.d[1], $rk4v.d[0]                           @ GHASH final-1 block - mid
4904	fmov    $res1d, $input_l0                                @ AES final block - mov low
4905
4906	fmov    $res1.d[1], $input_h0                            @ AES final block - mov high
4907
4908	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     @ GHASH final-1 block - mid
4909
4910	pmull   $rk3q1, $res0.1d, $h2.1d                         @ GHASH final-1 block - low
4911
4912	eor     $res1b, $res1b, $ctr3b                           @ AES final block - result
4913	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-1 block - mid
4914
4915	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-1 block - low
4916	.L256_enc_blocks_less_than_1:                            @ blocks left <= 1
4917
4918	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
4919
4920	mvn     $rk14_l, xzr                                     @ rk14_l = 0xffffffffffffffff
4921	sub     $bit_length, $bit_length, #128                   @ bit_length -= 128
4922
4923	neg     $bit_length, $bit_length                         @ bit_length = 128 - #bits in input (in range [1,128])
4924	ld1     { $rk0}, [$output_ptr]                           @ load existing bytes where the possibly partial last block is to be stored
4925
4926	mvn     $rk14_h, xzr                                     @ rk14_h = 0xffffffffffffffff
4927	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
4928
4929	lsr     $rk14_h, $rk14_h, $bit_length                    @ rk14_h is mask for top 64b of last block
4930	cmp     $bit_length, #64
4931
4932	csel    $input_l0, $rk14_l, $rk14_h, lt
4933	csel    $input_h0, $rk14_h, xzr, lt
4934
4935	fmov    $ctr0d, $input_l0                                @ ctr0b is mask for last block
4936
4937	fmov    $ctr0.d[1], $input_h0
4938
4939	and     $res1b, $res1b, $ctr0b                           @ possibly partial last block has zeroes in highest bits
4940
4941	rev64   $res0b, $res1b                                   @ GHASH final block
4942
4943	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
4944
4945	bif     $res1b, $rk0, $ctr0b                             @ insert existing bytes in top end of result before storing
4946
4947	pmull2  $rk2q1, $res0.2d, $h1.2d                         @ GHASH final block - high
4948	mov     $t0d, $res0.d[1]                                 @ GHASH final block - mid
4949#ifndef __AARCH64EB__
4950	rev     $ctr32w, $rctr32w
4951#else
4952	mov     $ctr32w, $rctr32w
4953#endif
4954
4955	pmull   $rk3q1, $res0.1d, $h1.1d                         @ GHASH final block - low
4956
4957	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final block - high
4958	eor     $t0.8b, $t0.8b, $res0.8b                         @ GHASH final block - mid
4959
4960	pmull   $t0.1q, $t0.1d, $h12k.1d                         @ GHASH final block - mid
4961
4962	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final block - low
4963
4964	eor     $acc_mb, $acc_mb, $t0.16b                        @ GHASH final block - mid
4965	movi    $mod_constant.8b, #0xc2
4966
4967	eor     $t9.16b, $acc_lb, $acc_hb                        @ MODULO - karatsuba tidy up
4968
4969	shl     $mod_constantd, $mod_constantd, #56              @ mod_constant
4970
4971	eor     $acc_mb, $acc_mb, $t9.16b                        @ MODULO - karatsuba tidy up
4972
4973	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d           @ MODULO - top 64b align with mid
4974
4975	ext     $acc_hb, $acc_hb, $acc_hb, #8                    @ MODULO - other top alignment
4976
4977	eor     $acc_mb, $acc_mb, $mod_t.16b                     @ MODULO - fold into mid
4978
4979	eor     $acc_mb, $acc_mb, $acc_hb                        @ MODULO - fold into mid
4980
4981	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d           @ MODULO - mid 64b align with low
4982
4983	ext     $acc_mb, $acc_mb, $acc_mb, #8                    @ MODULO - other mid alignment
4984
4985	str     $ctr32w, [$counter, #12]                         @ store the updated counter
4986
4987	st1     { $res1b}, [$output_ptr]                         @ store all 16B
4988	eor     $acc_lb, $acc_lb, $acc_hb                        @ MODULO - fold into low
4989
4990	eor     $acc_lb, $acc_lb, $acc_mb                        @ MODULO - fold into low
4991	ext     $acc_lb, $acc_lb, $acc_lb, #8
4992	rev64   $acc_lb, $acc_lb
4993	mov     x0, $len
4994	st1     { $acc_l.16b }, [$current_tag]
4995
4996	ldp     x21, x22, [sp, #16]
4997	ldp     x23, x24, [sp, #32]
4998	ldp     d8, d9, [sp, #48]
4999	ldp     d10, d11, [sp, #64]
5000	ldp     d12, d13, [sp, #80]
5001	ldp     d14, d15, [sp, #96]
5002	ldp     x19, x20, [sp], #112
5003	ret
5004
.L256_enc_ret:                                                    @ early exit - no registers are restored here. NOTE(review): presumably reached only from the zero-length check at function entry, before the frame is pushed - confirm
	mov w0, #0x0                                                  @ return value 0
	ret
5008.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
5009___
5010
5011{
5012my $t8="v4";
5013my $t8d="d4";
5014my $t9="v6";
5015my $t9d="d6";
5016#########################################################################################
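5017# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
5018#                               size_t len,             /* presumably a bit count, see note */
5019#                               unsigned char *out,
5020#                               const void *key,
5021#                               unsigned char ivec[16],
5022#                               u64 *Xi);
5023# NOTE(review): the code derives byte_len as bit_length >> 3, and on entry copies x4 to x16 and x5 to x8; confirm the key/ivec/Xi order listed above against the C prototype.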
5024$code.=<<___;
5025.global aes_gcm_dec_256_kernel
5026.type   aes_gcm_dec_256_kernel,%function
5027.align  4
5028aes_gcm_dec_256_kernel:
5029	cbz     x1, .L256_dec_ret
5030	stp     x19, x20, [sp, #-112]!
5031	mov     x16, x4
5032	mov     x8, x5
5033	stp     x21, x22, [sp, #16]
5034	stp     x23, x24, [sp, #32]
5035	stp     d8, d9, [sp, #48]
5036	stp     d10, d11, [sp, #64]
5037	stp     d12, d13, [sp, #80]
5038	stp     d14, d15, [sp, #96]
5039
5040	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
5041	mov     $len, $main_end_input_ptr
5042	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
5043#ifdef __AARCH64EB__
5044	rev     $ctr96_b64x, $ctr96_b64x
5045	rev     $ctr96_t32x, $ctr96_t32x
5046#endif
5047	ldp     $rk14_l, $rk14_h, [$cc, #224]                     @ load rk14
5048#ifdef __AARCH64EB__
5049	ror     $rk14_h, $rk14_h, #32
5050	ror     $rk14_l, $rk14_l, #32
5051#endif
5052	ld1     {$rk0s}, [$cc], #16                               @ load rk0
5053	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
5054
5055	ld1     {$rk1s}, [$cc], #16                               @ load rk1
5056	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5057
5058	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
5059	ld1     {$rk2s}, [$cc], #16                               @ load rk2
5060
5061	lsr     $rctr32x, $ctr96_t32x, #32
5062	ld1     {$rk3s}, [$cc], #16                               @ load rk3
5063	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
5064
5065	ld1     {$rk4s}, [$cc], #16                               @ load rk4
5066	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
5067	rev     $rctr32w, $rctr32w                                @ rev_ctr32
5068
5069	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
5070	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
5071
5072	rev     $ctr32w, $rctr32w                                 @ CTR block 1
5073	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
5074	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
5075
5076	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
5077	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
5078
5079	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
5080	rev     $ctr32w, $rctr32w                                 @ CTR block 2
5081	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
5082
5083	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
5084	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
5085
5086	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
5087	rev     $ctr32w, $rctr32w                                 @ CTR block 3
5088
5089	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
5090	ld1     {$rk5s}, [$cc], #16                               @ load rk5
5091
5092	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
5093	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
5094
5095	ld1     {$rk6s}, [$cc], #16                               @ load rk6
5096
5097	ld1     {$rk7s}, [$cc], #16                               @ load rk7
5098
5099	ld1     {$rk8s}, [$cc], #16                               @ load rk8
5100
5101	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
5102	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
5103#ifndef __AARCH64EB__
5104	ext     $h3b, $h3b, $h3b, #8
5105#endif
5106
5107	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
5108	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
5109#ifndef __AARCH64EB__
5110	ext     $h4b, $h4b, $h4b, #8
5111#endif
5112
5113	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
5114	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
5115#ifndef __AARCH64EB__
5116	ext     $h2b, $h2b, $h2b, #8
5117#endif
5118
5119	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
5120	ld1     {$rk9s}, [$cc], #16                                 @ load rk9
5121
5122	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
5123
5124	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
5125	ld1     { $acc_lb}, [$current_tag]
5126	ext     $acc_lb, $acc_lb, $acc_lb, #8
5127	rev64   $acc_lb, $acc_lb
5128
5129	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
5130	ld1     {$rk10s}, [$cc], #16                              @ load rk10
5131
5132	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
5133	ld1     {$rk11s}, [$cc], #16                              @ load rk11
5134
5135	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
5136	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
5137#ifndef __AARCH64EB__
5138	ext     $h1b, $h1b, $h1b, #8
5139#endif
5140	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
5141	ld1     {$rk12s}, [$cc], #16                              @ load rk12
5142
5143	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
5144
5145	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
5146
5147	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
5148
5149	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
5150
5151	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
5152	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
5153
5154	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
5155
5156	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
5157
5158	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
5159
5160	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
5161
5162	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
5163
5164	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
5165
5166	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
5167
5168	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
5169
5170	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
5171
5172	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
5173
5174	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
5175
5176	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
5177
5178	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
5179
5180	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
5181
5182	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
5183
5184	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
5185
5186	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
5187
5188	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
5189
5190	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
5191
5192	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
5193
5194	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 9
5195
5196	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
5197	ld1     {$rk13s}, [$cc], #16                             @ load rk13
5198
5199	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 9
5200
5201	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 10
5202
5203	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 9
5204
5205	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 10
5206
5207	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 9
5208
5209	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 10
5210
5211	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 11
5212
5213	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 10
5214
5215	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 11
5216
5217	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 11
5218
5219	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 11
5220
5221	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
5222
5223	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
5224
5225	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
5226	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
5227
5228	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 12
5229
5230	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 12
5231
5232	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 12
5233
5234	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 12
5235	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
5236
5237	aese    $ctr1b, $rk13                                     @ AES block 1 - round 13
5238
5239	aese    $ctr2b, $rk13                                     @ AES block 2 - round 13
5240	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
5241
5242	aese    $ctr3b, $rk13                                     @ AES block 3 - round 13
5243
5244	aese    $ctr0b, $rk13                                     @ AES block 0 - round 13
5245	b.ge    .L256_dec_tail                                    @ handle tail
5246
5247	ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0,1 - load ciphertext
5248
5249	rev     $ctr32w, $rctr32w                                 @ CTR block 4
5250
5251	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 0 - result
5252
5253	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
5254	rev64   $res1b, $res1b                                    @ GHASH block 1
5255	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 2 - load ciphertext
5256
5257	mov     $output_h0, $ctr0.d[1]                            @ AES block 0 - mov high
5258
5259	mov     $output_l0, $ctr0.d[0]                            @ AES block 0 - mov low
5260	rev64   $res0b, $res0b                                    @ GHASH block 0
5261	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
5262
5263	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
5264	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
5265
5266	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
5267	rev     $ctr32w, $rctr32w                                 @ CTR block 5
5268	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
5269
5270	mov     $output_l1, $ctr1.d[0]                            @ AES block 1 - mov low
5271
5272	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
5273	mov     $output_h1, $ctr1.d[1]                            @ AES block 1 - mov high
5274	eor     $output_h0, $output_h0, $rk14_h                   @ AES block 0 - round 14 high
5275#ifdef __AARCH64EB__
5276	rev     $output_h0, $output_h0
5277#endif
5278	eor     $output_l0, $output_l0, $rk14_l                   @ AES block 0 - round 14 low
5279#ifdef __AARCH64EB__
5280	rev     $output_l0, $output_l0
5281#endif
5282	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
5283	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
5284
5285	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 3 - load ciphertext
5286
5287	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
5288	rev     $ctr32w, $rctr32w                                 @ CTR block 6
5289	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
5290
5291	eor     $output_l1, $output_l1, $rk14_l                   @ AES block 1 - round 14 low
5292#ifdef __AARCH64EB__
5293	rev     $output_l1, $output_l1
5294#endif
5295	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
5296
5297	eor     $output_h1, $output_h1, $rk14_h                   @ AES block 1 - round 14 high
5298#ifdef __AARCH64EB__
5299	rev     $output_h1, $output_h1
5300#endif
5301	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 1 - store result
5302
5303	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 2 - result
5304	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
5305	b.ge    .L256_dec_prepretail                              @ do prepretail
5306
5307	.L256_dec_main_loop:                                      @ main loop start
5308	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
5309	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
5310	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
5311
5312	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
5313	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
5314
5315	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
5316	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
5317
5318	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
5319	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
5320	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
5321
5322	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
5323	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
5324
5325	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
5326	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
5327
5328	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
5329	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
5330	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
5331
5332	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
5333	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
5334
5335	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
5336	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
5337
5338	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
5339	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
5340
5341	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
5342	eor     $output_h2, $output_h2, $rk14_h                   @ AES block 4k+2 - round 14 high
5343#ifdef __AARCH64EB__
5344	rev     $output_h2, $output_h2
5345#endif
5346	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
5347	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
5348
5349	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
5350	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
5351
5352	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
5353	eor     $output_l2, $output_l2, $rk14_l                   @ AES block 4k+2 - round 14 low
5354#ifdef __AARCH64EB__
5355	rev     $output_l2, $output_l2
5356#endif
5357	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
5358	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
5359
5360	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
5361
5362	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
5363
5364	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
5365	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
5366
5367	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
5368	eor     $output_l3, $output_l3, $rk14_l                   @ AES block 4k+3 - round 14 low
5369#ifdef __AARCH64EB__
5370	rev     $output_l3, $output_l3
5371#endif
5372	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
5373	eor     $output_h3, $output_h3, $rk14_h                   @ AES block 4k+3 - round 14 high
5374#ifdef __AARCH64EB__
5375	rev     $output_h3, $output_h3
5376#endif
5377	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
5378
5379	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
5380
5381	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
5382	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
5383
5384	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
5385	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
5386
5387	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
5388	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
5389
5390	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
5391	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
5392
5393	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
5394	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
5395
5396	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
5397
5398	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
5399	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
5400
5401	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
5402
5403	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
5404	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
5405
5406	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
5407	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
5408
5409	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
5410	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
5411
5412	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
5413	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
5414
5415	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
5416
5417	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
5418	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
5419
5420	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
5421
5422	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
5423	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
5424
5425	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
5426
5427	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
5428
5429	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
5430	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
5431
5432	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
5433
5434	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
5435	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
5436	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
5437
5438	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
5439
5440	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
5441	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
5442
5443	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
5444
5445	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
5446	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
5447
5448	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
5449
5450	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
5451	movi    $mod_constant.8b, #0xc2
5452
5453	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
5454	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
5455
5456	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 11
5457
5458	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
5459	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
5460
5461	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
5462	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
5463
5464	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 12
5465
5466	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
5467	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
5468
5469	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
5470	ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
5471
5472	aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
5473	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
5474
5475	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
5476	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
5477
5478	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
5479	ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
5480
5481	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
5482	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
5483
5484	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
5485	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
5486
5487	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
5488	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
5489
5490	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
5491	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
5492
5493	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
5494	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+7 - load ciphertext
5495
5496	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
5497	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
5498
5499	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
5500	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
5501
5502	aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
5503	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
5504
5505	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
5506	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
5507
5508	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
5509	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
5510
5511	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
5512	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 4k+5 - result
5513	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
5514
5515	aese    $ctr2b, $rk13                                     @ AES block 4k+6 - round 13
5516	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
5517	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
5518
5519	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
5520
5521	eor     $output_l0, $output_l0, $rk14_l                   @ AES block 4k+4 - round 14 low
5522#ifdef __AARCH64EB__
5523	rev     $output_l0, $output_l0
5524#endif
5525	eor     $output_h0, $output_h0, $rk14_h                   @ AES block 4k+4 - round 14 high
5526#ifdef __AARCH64EB__
5527	rev     $output_h0, $output_h0
5528#endif
5529	mov     $output_h1, $ctr1.d[1]                            @ AES block 4k+5 - mov high
5530	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 4k+6 - result
5531	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
5532
5533	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 12
5534	mov     $output_l1, $ctr1.d[0]                            @ AES block 4k+5 - mov low
5535
5536	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
5537	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
5538
5539	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
5540	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
5541	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
5542
5543	aese    $ctr3b, $rk13                                     @ AES block 4k+7 - round 13
5544	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
5545
5546	rev64   $res1b, $res1b                                    @ GHASH block 4k+5
5547	eor     $output_h1, $output_h1, $rk14_h                   @ AES block 4k+5 - round 14 high
5548#ifdef __AARCH64EB__
5549	rev     $output_h1, $output_h1
5550#endif
5551	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
5552
5553	eor     $output_l1, $output_l1, $rk14_l                   @ AES block 4k+5 - round 14 low
5554#ifdef __AARCH64EB__
5555	rev     $output_l1, $output_l1
5556#endif
5557	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 4k+5 - store result
5558
5559	rev64   $res0b, $res0b                                    @ GHASH block 4k+4
5560	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
5561	b.lt    .L256_dec_main_loop
5562
5563
5564	.L256_dec_prepretail:                                     @ PREPRETAIL
5565	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
5566	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
5567	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
5568
5569	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
5570	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
5571
5572	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
5573	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
5574
5575	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
5576	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
5577	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
5578
5579	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
5580	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
5581	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
5582
5583	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
5584	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
5585
5586	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
5587	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
5588	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
5589
5590	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
5591	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
5592
5593	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
5594	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
5595
5596	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
5597	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
5598
5599	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
5600
5601	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
5602	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
5603
5604	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
5605
5606	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
5607	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
5608
5609	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
5610
5611	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
5612	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
5613
5614	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
5615
5616	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
5617	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
5618
5619	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
5620
5621	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
5622	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
5623
5624	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
5625	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
5626
5627	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
5628
5629	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
5630
5631	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
5632	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
5633
5634	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
5635
5636	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
5637	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
5638
5639	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
5640
5641	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
5642	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
5643
5644	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
5645
5646	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
5647	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
5648
5649	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
5650
5651	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
5652	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
5653
5654	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
5655
5656	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
5657	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
5658
5659	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
5660
5661	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
5662
5663	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
5664	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
5665
5666	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
5667
5668	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
5669	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
5670
5671	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
5672
5673	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
5674	movi    $mod_constant.8b, #0xc2
5675
5676	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
5677	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
5678
5679	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
5680
5681	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
5682	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
5683
5684	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
5685
5686	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
5687	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
5688
5689	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
5690
5691	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
5692	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
5693
5694	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
5695
5696	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
5697	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
5698
5699	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
5700
5701	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
5702	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
5703
5704	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
5705
5706	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
5707	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
5708
5709	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
5710
5711	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
5712	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
5713
5714	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
5715
5716	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
5717
5718	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
5719	eor     $output_h2, $output_h2, $rk14_h                   @ AES block 4k+2 - round 14 high
5720#ifdef __AARCH64EB__
5721	rev     $output_h2, $output_h2
5722#endif
5723	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
5724	eor     $output_l3, $output_l3, $rk14_l                   @ AES block 4k+3 - round 14 low
5725#ifdef __AARCH64EB__
5726	rev     $output_l3, $output_l3
5727#endif
5728	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
5729	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
5730
5731	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 11
5732	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
5733
5734	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
5735	eor     $output_l2, $output_l2, $rk14_l                   @ AES block 4k+2 - round 14 low
5736#ifdef __AARCH64EB__
5737	rev     $output_l2, $output_l2
5738#endif
5739
5740	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
5741
5742	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
5743	eor     $output_h3, $output_h3, $rk14_h                   @ AES block 4k+3 - round 14 high
5744#ifdef __AARCH64EB__
5745	rev     $output_h3, $output_h3
5746#endif
5747
5748	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
5749	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
5750
5751	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
5752	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
5753
5754	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 12
5755	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
5756
5757	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 12
5758	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
5759
5760	aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
5761
5762	aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
5763
5764	aese    $ctr3b, $rk13                                     @ AES block 4k+7 - round 13
5765
5766	aese    $ctr2b, $rk13                                     @ AES block 4k+6 - round 13
5767	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
5768	.L256_dec_tail:                                           @ TAIL
5769
5770	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
5771	ld1     { $res1b}, [$input_ptr], #16                      @ AES block 4k+4 - load ciphertext
5772
5773	eor     $ctr0b, $res1b, $ctr0b                            @ AES block 4k+4 - result
5774
5775	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
5776
5777	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
5778	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
5779
5780	cmp     $main_end_input_ptr, #48
5781
5782	eor     $output_l0, $output_l0, $rk14_l                   @ AES block 4k+4 - round 14 low
5783#ifdef __AARCH64EB__
5784	rev     $output_l0, $output_l0
5785#endif
5786
5787	eor     $output_h0, $output_h0, $rk14_h                   @ AES block 4k+4 - round 14 high
5788#ifdef __AARCH64EB__
5789	rev     $output_h0, $output_h0
5790#endif
5791	b.gt    .L256_dec_blocks_more_than_3
5792
5793	sub     $rctr32w, $rctr32w, #1
5794	mov     $ctr3b, $ctr2b
5795	movi    $acc_m.8b, #0
5796
5797	movi    $acc_l.8b, #0
5798	cmp     $main_end_input_ptr, #32
5799
5800	movi    $acc_h.8b, #0
5801	mov     $ctr2b, $ctr1b
5802	b.gt    .L256_dec_blocks_more_than_2
5803
5804	sub     $rctr32w, $rctr32w, #1
5805
5806	mov     $ctr3b, $ctr1b
5807	cmp     $main_end_input_ptr, #16
5808	b.gt    .L256_dec_blocks_more_than_1
5809
5810	sub     $rctr32w, $rctr32w, #1
5811	b       .L256_dec_blocks_less_than_1
5812	.L256_dec_blocks_more_than_3:                            @ blocks left >  3
5813	rev64   $res0b, $res1b                                   @ GHASH final-3 block
5814	ld1     { $res1b}, [$input_ptr], #16                     @ AES final-2 block - load ciphertext
5815
5816	stp     $output_l0, $output_h0, [$output_ptr], #16       @ AES final-3 block  - store result
5817
5818	mov     $acc_md, $h34k.d[1]                              @ GHASH final-3 block - mid
5819
5820	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
5821
5822	eor     $ctr0b, $res1b, $ctr1b                           @ AES final-2 block - result
5823
5824	mov     $rk4d, $res0.d[1]                                @ GHASH final-3 block - mid
5825
5826	mov     $output_l0, $ctr0.d[0]                           @ AES final-2 block - mov low
5827
5828	mov     $output_h0, $ctr0.d[1]                           @ AES final-2 block - mov high
5829
5830	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-3 block - mid
5831
5832	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
5833
5834	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH final-3 block - high
5835
5836	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   @ GHASH final-3 block - mid
5837	eor     $output_l0, $output_l0, $rk14_l                  @ AES final-2 block - round 14 low
5838#ifdef __AARCH64EB__
5839	rev     $output_l0, $output_l0
5840#endif
5841
5842	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH final-3 block - low
5843	eor     $output_h0, $output_h0, $rk14_h                  @ AES final-2 block - round 14 high
5844#ifdef __AARCH64EB__
5845	rev     $output_h0, $output_h0
5846#endif
5847	.L256_dec_blocks_more_than_2:                            @ blocks left >  2
5848
5849	rev64   $res0b, $res1b                                   @ GHASH final-2 block
5850	ld1     { $res1b}, [$input_ptr], #16                     @ AES final-1 block - load ciphertext
5851
5852	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
5853	stp     $output_l0, $output_h0, [$output_ptr], #16       @ AES final-2 block  - store result
5854
5855	eor     $ctr0b, $res1b, $ctr2b                           @ AES final-1 block - result
5856
5857	mov     $rk4d, $res0.d[1]                                @ GHASH final-2 block - mid
5858
5859	pmull   $rk3q1, $res0.1d, $h3.1d                         @ GHASH final-2 block - low
5860
5861	pmull2  $rk2q1, $res0.2d, $h3.2d                         @ GHASH final-2 block - high
5862
5863	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-2 block - mid
5864	mov     $output_l0, $ctr0.d[0]                           @ AES final-1 block - mov low
5865
5866	mov     $output_h0, $ctr0.d[1]                           @ AES final-1 block - mov high
5867	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-2 block - low
5868	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
5869
5870	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     @ GHASH final-2 block - mid
5871
5872	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-2 block - high
5873	eor     $output_l0, $output_l0, $rk14_l                  @ AES final-1 block - round 14 low
5874#ifdef __AARCH64EB__
5875	rev     $output_l0, $output_l0
5876#endif
5877
5878	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-2 block - mid
5879	eor     $output_h0, $output_h0, $rk14_h                  @ AES final-1 block - round 14 high
5880#ifdef __AARCH64EB__
5881	rev     $output_h0, $output_h0
5882#endif
5883	.L256_dec_blocks_more_than_1:                            @ blocks left >  1
5884
5885	stp     $output_l0, $output_h0, [$output_ptr], #16       @ AES final-1 block  - store result
5886	rev64   $res0b, $res1b                                   @ GHASH final-1 block
5887
5888	ld1     { $res1b}, [$input_ptr], #16                     @ AES final block - load ciphertext
5889
5890	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
5891	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
5892
5893	mov     $rk4d, $res0.d[1]                                @ GHASH final-1 block - mid
5894
5895	eor     $ctr0b, $res1b, $ctr3b                           @ AES final block - result
5896
5897	pmull2  $rk2q1, $res0.2d, $h2.2d                         @ GHASH final-1 block - high
5898
5899	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-1 block - mid
5900
5901	pmull   $rk3q1, $res0.1d, $h2.1d                         @ GHASH final-1 block - low
5902	mov     $output_l0, $ctr0.d[0]                           @ AES final block - mov low
5903
5904	ins     $rk4v.d[1], $rk4v.d[0]                           @ GHASH final-1 block - mid
5905
5906	mov     $output_h0, $ctr0.d[1]                           @ AES final block - mov high
5907
5908	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     @ GHASH final-1 block - mid
5909	eor     $output_l0, $output_l0, $rk14_l                  @ AES final block - round 14 low
5910#ifdef __AARCH64EB__
5911	rev     $output_l0, $output_l0
5912#endif
5913	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-1 block - low
5914
5915	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-1 block - high
5916
5917	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-1 block - mid
5918	eor     $output_h0, $output_h0, $rk14_h                  @ AES final block - round 14 high
5919#ifdef __AARCH64EB__
5920	rev     $output_h0, $output_h0
5921#endif
5922	.L256_dec_blocks_less_than_1:                            @ blocks left <= 1
5923
5924	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
5925	mvn     $rk14_h, xzr                                     @ rk14_h = 0xffffffffffffffff
5926
5927	sub     $bit_length, $bit_length, #128                   @ bit_length -= 128
5928	mvn     $rk14_l, xzr                                     @ rk14_l = 0xffffffffffffffff
5929
5930	ldp     $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
5931	neg     $bit_length, $bit_length                         @ bit_length = 128 - #bits in input (in range [1,128])
5932
5933	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
5934
5935	lsr     $rk14_h, $rk14_h, $bit_length                    @ rk14_h is mask for top 64b of last block
5936	cmp     $bit_length, #64
5937
5938	csel    $ctr32x, $rk14_l, $rk14_h, lt
5939	csel    $ctr96_b64x, $rk14_h, xzr, lt
5940
5941	fmov    $ctr0d, $ctr32x                                  @ ctr0b is mask for last block
5942	and     $output_l0, $output_l0, $ctr32x
5943
5944	mov     $ctr0.d[1], $ctr96_b64x
5945	bic     $end_input_ptr, $end_input_ptr, $ctr32x          @ mask out low existing bytes
5946
5947#ifndef __AARCH64EB__
5948	rev     $ctr32w, $rctr32w
5949#else
5950	mov     $ctr32w, $rctr32w
5951#endif
5952
5953	bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x      @ mask out high existing bytes
5954
5955	orr     $output_l0, $output_l0, $end_input_ptr
5956
5957	and     $output_h0, $output_h0, $ctr96_b64x
5958
5959	orr     $output_h0, $output_h0, $main_end_input_ptr
5960
5961	and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
5962
5963	rev64   $res0b, $res1b                                    @ GHASH final block
5964
5965	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
5966
5967	pmull   $rk3q1, $res0.1d, $h1.1d                          @ GHASH final block - low
5968
5969	mov     $t0d, $res0.d[1]                                  @ GHASH final block - mid
5970
5971	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
5972
5973	pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
5974
5975	pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
5976
5977	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final block - high
5978
5979	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final block - low
5980
5981	eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
5982	movi    $mod_constant.8b, #0xc2
5983
5984	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
5985
5986	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
5987
5988	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
5989
5990	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
5991
5992	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
5993
5994	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
5995
5996	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
5997
5998	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
5999
6000	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
6001
6002	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
6003
6004	stp     $output_l0, $output_h0, [$output_ptr]
6005
6006	str     $ctr32w, [$counter, #12]                          @ store the updated counter
6007
6008	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
6009	ext     $acc_lb, $acc_lb, $acc_lb, #8
6010	rev64   $acc_lb, $acc_lb
6011	mov     x0, $len
6012	st1     { $acc_l.16b }, [$current_tag]
6013
6014	ldp     x21, x22, [sp, #16]
6015	ldp     x23, x24, [sp, #32]
6016	ldp     d8, d9, [sp, #48]
6017	ldp     d10, d11, [sp, #64]
6018	ldp     d12, d13, [sp, #80]
6019	ldp     d14, d15, [sp, #96]
6020	ldp     x19, x20, [sp], #112
6021	ret
6022
6023.L256_dec_ret:
6024	mov w0, #0x0
6025	ret
6026.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
6027___
6028}
6029}
6030
# Append the trailer of the generated assembly text: an identification
# string, an alignment directive and a closing "#endif" (presumably paired
# with a preprocessor conditional emitted earlier in the file -- confirm).
$code .= <<"END_OF_TRAILER";
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#endif
END_OF_TRAILER
6036
6037if ($flavour =~ /64/) {         ######## 64-bit code
6038    sub unvmov {
6039        my $arg=shift;
6040
6041        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
6042        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
6043                             $3<8?$3:$3+8,($4 eq "lo")?0:1;
6044    }
6045    foreach(split("\n",$code)) {
6046        s/@\s/\/\//o;               # old->new style commentary
6047        print $_,"\n";
6048    }
6049} else {                ######## 32-bit code
6050    sub unvdup32 {
6051        my $arg=shift;
6052
6053        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
6054        sprintf "vdup.32    q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
6055    }
6056    sub unvpmullp64 {
6057        my ($mnemonic,$arg)=@_;
6058
6059        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
6060            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
6061                       |(($2&7)<<17)|(($2&8)<<4)
6062                       |(($3&7)<<1) |(($3&8)<<2);
6063            $word |= 0x00010001  if ($mnemonic =~ "2");
6064            # since ARMv7 instructions are always encoded little-endian.
6065            # correct solution is to use .inst directive, but older%%%%
6066            # assemblers don't implement it:-(
6067            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
6068                    $word&0xff,($word>>8)&0xff,
6069                    ($word>>16)&0xff,($word>>24)&0xff,
6070                    $mnemonic,$arg;
6071        }
6072    }
6073
6074    foreach(split("\n",$code)) {
6075        s/\b[wx]([0-9]+)\b/r$1/go;      # new->old registers
6076        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
6077        s/\/\/\s?/@ /o;             # new->old style commentary
6078
6079        # fix up remaining new-style suffixes
6080        s/\],#[0-9]+/]!/o;
6081
6082        s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o         or
6083        s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
6084        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo        or
6085        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo   or
6086        s/^(\s+)b\./$1b/o                       or
6087        s/^(\s+)ret/$1bx\tlr/o;
6088
6089        if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
6090            print "     it      $2\n";
6091        }
6092        s/__AARCH64E([BL])__/__ARME$1__/go;
6093        print $_,"\n";
6094    }
6095}
6096
# Close STDOUT explicitly to enforce the final flush; if the close fails
# the script dies with the system error instead of ending silently.
unless (close STDOUT) {
    die "error closing STDOUT: $!";
}
6098