#! /usr/bin/env perl
# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# Use the larger AArch64 register bank to "accommodate" 4x aggregated
# reduction and improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#		64-bit PMULL	32-bit PMULL	32-bit NEON(*)
# Apple A7	0.58		0.92		5.62
# Cortex-A53	0.85		1.01		8.39
# Cortex-A57	0.73		1.17		7.61
# Denver	0.51		0.65		6.02
# Mongoose	0.65		1.10		8.06
# Kryo		0.76		1.16		8.00
# ThunderX2	1.05
#
# (*)	presented for reference/comparison purposes;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$Xi="x0";	# argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
$code.=".arch	armv8-a+crypto\n.text\n"	if ($flavour =~ /64/);
$code.=<<___					if ($flavour !~ /64/);
.fpu	neon
#ifdef __thumb2__
.syntax        unified
.thumb
# define INST(a,b,c,d) $_byte  c,0xef,a,b
#else
.code  32
# define INST(a,b,c,d) $_byte  a,b,c,0xf2
#endif

.text
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:	128-bit H - secret parameter E(K,0^128)
# output:	precomputed table filled with powers of twisted H;
#		H is twisted to handle the reverse bitness of GHASH;
#		only a few of the 16 slots of Htable[16] are used;
#		the data is opaque to the outside world (which allows
#		the code to be optimized independently);
#
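# A minimal calling sketch from the C side (hypothetical caller, shown for
# illustration only; the real types and call sites live in OpenSSL's gcm128
# code):
#
#	u128 Htable[16];
#	u64  H[2];			/* hash key E(K,0^128), in the byte
#					   order expected by the GCM layer */
#	gcm_init_v8(Htable, H);		/* fill Htable with powers of
#					   twisted H */
#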
$code.=<<___;
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	vld1.64		{$t1},[x1]		@ load input H
	vmov.i8		$xC2,#0xe1
	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
	vext.8		$IN,$t1,$t1,#8
	vshr.u64	$t2,$xC2,#63
	vdup.32		$t1,${t1}[1]
	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
	vshr.u64	$t2,$IN,#63
	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
	vand		$t2,$t2,$t0
	vshl.i64	$IN,$IN,#1
	vext.8		$t2,$t2,$t2,#8
	vand		$t0,$t0,$t1
	vorr		$IN,$IN,$t2		@ H<<<=1
	veor		$H,$IN,$t0		@ twisted H
	vst1.64		{$H},[x0],#16		@ store Htable[0]

	@ calculate H^2
	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
	vpmull.p64	$Xl,$H,$H
	veor		$t0,$t0,$H
	vpmull2.p64	$Xh,$H,$H
	vpmull.p64	$Xm,$t0,$t0

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$H2,$Xl,$t2

	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0],#32	@ store Htable[1..2]
___
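# At this point Htable[0] holds twisted H, Htable[1] the packed Karatsuba
# pre-processed halves and Htable[2] twisted H^2; the 64-bit flavour below
# additionally stores H^3, its packed halves and H^4 in Htable[3..5] for
# the 4x code path.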
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
	@ calculate H^3 and H^4
	vpmull.p64	$Xl,$H, $H2
	 vpmull.p64	$Yl,$H2,$H2
	vpmull2.p64	$Xh,$H, $H2
	 vpmull2.p64	$Yh,$H2,$H2
	vpmull.p64	$Xm,$t0,$t1
	 vpmull.p64	$Ym,$t1,$t1

	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
	 vext.8		$t1,$Yl,$Yh,#8
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t0
	 veor		$t3,$Yl,$Yh
	 veor		$Ym,$Ym,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
	 veor		$Ym,$Ym,$t3
	 vpmull.p64	$t3,$Yl,$xC2

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	 vmov		$Yh#lo,$Ym#hi
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vmov		$Ym#hi,$Yl#lo
	veor		$Xl,$Xm,$t2
	 veor		$Yl,$Ym,$t3

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	 vext.8		$t3,$Yl,$Yl,#8
	vpmull.p64	$Xl,$Xl,$xC2
	 vpmull.p64	$Yl,$Yl,$xC2
	veor		$t2,$t2,$Xh
	 veor		$t3,$t3,$Yh
	veor		$H, $Xl,$t2		@ H^3
	 veor		$H2,$Yl,$t3		@ H^4

	vext.8		$t0,$H, $H,#8		@ Karatsuba pre-processing
	 vext.8		$t1,$H2,$H2,#8
	veor		$t0,$t0,$H
	 veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$H-$H2},[x0]		@ store Htable[3..5]
___
}
$code.=<<___;
	ret
.size	gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
# output:	Xi - next hash value;
#
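# This is the single-block case, Xi = (Xi·H) mod P, computed with the
# twisted H from Htable[0] and the packed Karatsuba halves from Htable[1];
# a hedged sketch of the C-side call:
#
#	gcm_gmult_v8(Xi, Htable);	/* Xi <- (Xi·H) mod P */
#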
$code.=<<___;
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	vld1.64		{$t1},[$Xi]		@ load Xi
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
	vshl.u64	$xC2,$xC2,#57
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$IN,$t1,$t1,#8

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:	table precomputed in gcm_init_v8;
#		current hash value Xi;
#		pointer to input data;
#		length of input data in bytes, which must be divisible by
#		the block size;
# output:	next hash value Xi;
#
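# A hedged usage sketch (hypothetical caller, for illustration only):
#
#	/* fold len bytes (a multiple of 16) into the running hash Xi */
#	gcm_ghash_v8(Xi, Htable, inp, len);
#
# In the 64-bit flavour inputs of 64 bytes or more are diverted to the
# 4x-aggregated code path gcm_ghash_v8_4x defined further down.
#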
$code.=<<___;
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	cmp		$len,#64
	b.hs		.Lgcm_ghash_v8_4x
___
$code.=<<___		if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
						@ "[rotated]" means that the
						@ loaded value has to be
						@ rotated to appear as in
						@ the algorithm specification
	subs		$len,$len,#32		@ see if $len is 32 or larger
	mov		$inc,#16		@ $inc is used as post-
						@ increment for input pointer;
						@ as the loop is modulo-scheduled,
						@ $inc is zeroed just in time
						@ to avoid stepping past
						@ inp[len]; this means the
						@ last block[s] are actually
						@ loaded twice, but the last
						@ copy is not processed
	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H2},[$Htbl]
	cclr		$inc,eq			@ is it time to zero $inc?
	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
	b.lo		.Lodd_tail_v8		@ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
	#######
	# Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
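	# The same rewrite generalizes to k-fold aggregation,
	#	Xi+k = [H^k*(Ii+Xi) + H^(k-1)*Ii+1 + ... + H*Ii+k-1] mod P,
	# which is the identity the 64-bit-only 4x code path exploits with
	# k=4 (hence the precomputed H^3 and H^4 in Htable).
	#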
$code.=<<___;
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$In,$t1,$t1,#8
	veor		$IN,$IN,$Xl		@ I[i]^=Xi
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	vpmull2.p64	$Xhn,$H,$In
	b		.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8		$t2,$IN,$IN,#8
	subs		$len,$len,#32		@ is there more data?
	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
	cclr		$inc,lo			@ is it time to zero $inc?

	 vpmull.p64	$Xmn,$Hhl,$t1
	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
	veor		$Xl,$Xl,$Xln		@ accumulate
	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	 vld1.64	{$t0},[$inp],$inc	@ load [rotated] I[i+2]

	veor		$Xh,$Xh,$Xhn
	 cclr		$inc,eq			@ is it time to zero $inc?
	veor		$Xm,$Xm,$Xmn

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	 vrev64.8	$t0,$t0
#endif
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

#ifndef __ARMEB__
	 vrev64.8	$t1,$t1
#endif
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$In,$t1,$t1,#8
	 vext.8		$IN,$t0,$t0,#8
	veor		$Xl,$Xm,$t2
	 vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$IN,$IN,$Xh		@ accumulate $IN early

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$IN,$IN,$t2
	 veor		$t1,$t1,$In		@ Karatsuba pre-processing
	veor		$IN,$IN,$Xl
	 vpmull2.p64	$Xhn,$H,$In
	b.hs		.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor		$Xh,$Xh,$t2
	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
	adds		$len,$len,#32		@ re-construct $len
	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
	b.eq		.Ldone_v8		@ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
	vext.8		$t2,$Xl,$Xl,#8
	veor		$IN,$IN,$Xl		@ inp^=Xi
	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

___
$code.=<<___		if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) {				# 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

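# 64-bit-only code path: four input blocks are multiplied by H^4..H in
# parallel and the products are summed before a single two-phase reduction,
# i.e. one reduction per 64 bytes of input instead of one per 16.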
$code.=<<___;
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
	vld1.64		{$H-$H2},[$Htbl],#48	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H3-$H4},[$Htbl]	@ load twisted H^3, ..., H^4
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant

	vld1.64		{$I0-$j3},[$inp],#64
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$j3,$j3
	vrev64.8	$I0,$I0
#endif
	vext.8		$I3,$j3,$j3,#8
	vext.8		$I2,$j2,$j2,#8
	vext.8		$I1,$j1,$j1,#8

	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	veor		$j3,$j3,$I3
	vpmull2.p64	$Yh,$H,$I3
	vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	veor		$j2,$j2,$I2
	vpmull2.p64	$I2,$H2,$I2
	vpmull2.p64	$j2,$Hhl,$j2

	veor		$Yl,$Yl,$t0
	veor		$Yh,$Yh,$I2
	veor		$Ym,$Ym,$j2

	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	veor		$j1,$j1,$I1
	vpmull2.p64	$I1,$H3,$I1
	vpmull.p64	$j1,$H34,$j1

	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	veor		$Ym,$Ym,$j1

	subs		$len,$len,#128
	b.lo		.Ltail4x

	b		.Loop4x

.align	4
.Loop4x:
	veor		$t0,$I0,$Xl
	 vld1.64	{$I0-$j3},[$inp],#64
	vext.8		$IN,$t0,$t0,#8
#ifndef __ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$j2,$j2
	 vrev64.8	$j3,$j3
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	 vext.8		$I3,$j3,$j3,#8
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	 vext.8		$I2,$j2,$j2,#8
	veor		$Xm,$Xm,$Ym
	 vext.8		$I1,$j1,$j1,#8

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	 vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	 veor		$j3,$j3,$I3
	veor		$Xm,$Xm,$t1
	 vpmull2.p64	$Yh,$H,$I3
	veor		$Xm,$Xm,$t2
	 vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	 veor		$j2,$j2,$I2
	 vpmull2.p64	$I2,$H2,$I2
	veor		$Xl,$Xm,$t2
	 vpmull2.p64	$j2,$Hhl,$j2

	 veor		$Yl,$Yl,$t0
	 veor		$Yh,$Yh,$I2
	 veor		$Ym,$Ym,$j2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	 vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	 veor		$j1,$j1,$I1
	veor		$t2,$t2,$Xh
	 vpmull2.p64	$I1,$H3,$I1
	 vpmull.p64	$j1,$H34,$j1

	veor		$Xl,$Xl,$t2
	 veor		$Yl,$Yl,$j3
	 veor		$Yh,$Yh,$I1
	vext.8		$Xl,$Xl,$Xl,#8
	 veor		$Ym,$Ym,$j1

	subs		$len,$len,#64
	b.hs		.Loop4x

.Ltail4x:
	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym

	adds		$len,$len,#64
	b.eq		.Ldone4x

	cmp		$len,#32
	b.lo		.Lone
	b.eq		.Ltwo
.Lthree:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0-$j2},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$j2,$j2
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$I2,$j2,$j2,#8
	 vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	 vpmull.p64	$Yl,$H,$I2		@ H·Ii+2
	 veor		$j2,$j2,$I2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	 vpmull2.p64	$Yh,$H,$I2
	 vpmull.p64	$Ym,$Hhl,$j2
	veor		$Xl,$Xl,$t2
	 vpmull.p64	$j3,$H2,$I1		@ H^2·Ii+1
	 veor		$j1,$j1,$I1
	vext.8		$Xl,$Xl,$Xl,#8

	 vpmull2.p64	$I1,$H2,$I1
	veor		$t0,$I0,$Xl
	 vpmull2.p64	$j1,$Hhl,$j1
	vext.8		$IN,$t0,$t0,#8

	 veor		$Yl,$Yl,$j3
	 veor		$Yh,$Yh,$I1
	 veor		$Ym,$Ym,$j1

	vpmull.p64	$Xl,$H3,$IN		@ H^3·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H3,$IN
	vpmull.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Ltwo:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0-$j1},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	 vpmull.p64	$Yl,$H,$I1		@ H·Ii+1
	 veor		$j1,$j1,$I1

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	 vpmull2.p64	$Yh,$H,$I1
	 vpmull.p64	$Ym,$Hhl,$j1

	vpmull.p64	$Xl,$H2,$IN		@ H^2·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H2,$IN
	vpmull2.p64	$Xm,$Hhl,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Lone:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H,$IN
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H,$IN
	vpmull.p64	$Xm,$Hhl,$t0

.Ldone4x:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#endif
___

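# The code above is written in a 32-bit-flavoured "unified" NEON syntax;
# the loops below post-process it into either AArch64 or ARMv7 assembly,
# depending on $flavour, before it is piped through arm-xlate.pl.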
if ($flavour =~ /64/) {			######## 64-bit code
    sub unvmov {
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
					     $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
	s/vmov\s+(.*)/unvmov($1)/geo	or
	s/vext\.8/ext/o			or
	s/vshr\.s/sshr\.s/o		or
	s/vshr/ushr/o			or
	s/^(\s+)v/$1/o			or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;				# old->new style commentary

	# fix up remaining legacy suffixes
	s/\.[ui]?8(\s)/$1/o;
	s/\.[uis]?32//o and s/\.16b/\.4s/go;
	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
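    # vmull.p64/vmull2.p64 belong to the ARMv8 Crypto Extension and may not
    # be accepted by older 32-bit assemblers, so the instruction word is
    # assembled by hand and emitted through the INST() byte macro defined
    # at the top of the module.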
    sub unvpmullp64 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    $word |= 0x00010001	 if ($mnemonic =~ "2");
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the explicit byte splitting below. The correct solution is
	    # the .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }

    foreach(split("\n",$code)) {
	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/^(\s+)b\./$1b/o						or
	s/^(\s+)ret/$1bx\tlr/o;

	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "     it      $2\n";
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush