#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# Use the AArch64 register bank to "accommodate" 4x aggregated reduction
# and improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#		64-bit PMULL	32-bit PMULL	32-bit NEON(*)
# Apple A7	0.58		0.92		5.62
# Cortex-A53	0.85		1.01		8.39
# Cortex-A57	0.73		1.17		7.61
# Denver	0.51		0.65		6.02
# Mongoose	0.65		1.10		8.06
# Kryo		0.76		1.16		8.00
# ThunderX2	1.05
#
# (*)	presented for reference/comparison purposes;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
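
# Typically invoked (e.g. by the build system) with a perlasm "flavour"
# and an output file, as in "perl <this-script> linux64 ghash-armv8.S"
# (file name illustrative); the flavour string is matched against /64/
# and /win/ below to select AArch64 vs. AArch32 output and the
# appropriate assembler byte directive.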

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$Xi="x0";	# argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
$code.=".arch	armv8-a+crypto\n.text\n"	if ($flavour =~ /64/);
$code.=<<___					if ($flavour !~ /64/);
.fpu	neon
#ifdef __thumb2__
.syntax        unified
.thumb
# define INST(a,b,c,d) $_byte  c,0xef,a,b
#else
.code  32
# define INST(a,b,c,d) $_byte  a,b,c,0xf2
#endif

.text
___
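# Note: the INST() macro in the 32-bit preamble above emits PMULL/PMULL2
# as raw instruction bytes, so the code assembles even with assemblers
# that lack the crypto-extension mnemonics; the byte sequences are
# produced by unvpmullp64() at the bottom of this file.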

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:	128-bit H - secret parameter E(K,0^128)
# output:	precomputed table filled with degrees of twisted H;
#		H is twisted to handle the reverse bitness of GHASH;
#		only a few of the 16 slots of Htable[16] are used;
#		the data is opaque to the outside world (which allows
#		the code to be optimized independently);
#
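# Table layout, as written by the stores below (each slot is 16 bytes):
#	Htable[0]	twisted H
#	Htable[1]	Karatsuba pre-processed halves (H.lo^H.hi packed
#			together with the corresponding value for H^2)
#	Htable[2]	twisted H^2
#	Htable[3]	twisted H^3			(64-bit flavour only)
#	Htable[4]	pre-processed halves for H^3/H^4 (64-bit flavour only)
#	Htable[5]	twisted H^4			(64-bit flavour only)
#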
$code.=<<___;
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	vld1.64		{$t1},[x1]		@ load input H
	vmov.i8		$xC2,#0xe1
	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
	vext.8		$IN,$t1,$t1,#8
	vshr.u64	$t2,$xC2,#63
	vdup.32		$t1,${t1}[1]
	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
	vshr.u64	$t2,$IN,#63
	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
	vand		$t2,$t2,$t0
	vshl.i64	$IN,$IN,#1
	vext.8		$t2,$t2,$t2,#8
	vand		$t0,$t0,$t1
	vorr		$IN,$IN,$t2		@ H<<<=1
	veor		$H,$IN,$t0		@ twisted H
	vst1.64		{$H},[x0],#16		@ store Htable[0]

	@ calculate H^2
	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
	vpmull.p64	$Xl,$H,$H
	veor		$t0,$t0,$H
	vpmull2.p64	$Xh,$H,$H
	vpmull.p64	$Xm,$t0,$t0

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$H2,$Xl,$t2

	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0],#32	@ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
	@ calculate H^3 and H^4
	vpmull.p64	$Xl,$H, $H2
	 vpmull.p64	$Yl,$H2,$H2
	vpmull2.p64	$Xh,$H, $H2
	 vpmull2.p64	$Yh,$H2,$H2
	vpmull.p64	$Xm,$t0,$t1
	 vpmull.p64	$Ym,$t1,$t1

	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
	 vext.8		$t1,$Yl,$Yh,#8
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t0
	 veor		$t3,$Yl,$Yh
	 veor		$Ym,$Ym,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
	 veor		$Ym,$Ym,$t3
	 vpmull.p64	$t3,$Yl,$xC2

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	 vmov		$Yh#lo,$Ym#hi
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vmov		$Ym#hi,$Yl#lo
	veor		$Xl,$Xm,$t2
	 veor		$Yl,$Ym,$t3

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	 vext.8		$t3,$Yl,$Yl,#8
	vpmull.p64	$Xl,$Xl,$xC2
	 vpmull.p64	$Yl,$Yl,$xC2
	veor		$t2,$t2,$Xh
	 veor		$t3,$t3,$Yh
	veor		$H, $Xl,$t2		@ H^3
	 veor		$H2,$Yl,$t3		@ H^4

	vext.8		$t0,$H, $H,#8		@ Karatsuba pre-processing
	 vext.8		$t1,$H2,$H2,#8
	veor		$t0,$t0,$H
	 veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$H-$H2},[x0]		@ store Htable[3..5]
___
}
$code.=<<___;
	ret
.size	gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
# output:	Xi - next hash value;
#
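# The multiply below is one Karatsuba-style 128x128->256-bit carry-less
# multiplication (three PMULL/PMULL2 instructions) followed by the
# two-phase reduction modulo the GHASH polynomial; gcm_ghash_v8 repeats
# the same pattern per block, with aggregation.
#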
$code.=<<___;
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	vld1.64		{$t1},[$Xi]		@ load Xi
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
	vshl.u64	$xC2,$xC2,#57
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$IN,$t1,$t1,#8

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:	table precomputed in gcm_init_v8;
#		current hash value Xi;
#		pointer to input data;
#		length of input data in bytes, which must be a multiple
#		of the block size;
# output:	next hash value Xi;
#
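# On AArch64, input of 64 bytes or more is diverted to the 4x aggregated
# loop (gcm_ghash_v8_4x below); shorter input, and all 32-bit input, is
# processed by the modulo-scheduled 2x loop, with a single-block odd
# tail when necessary.
#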
$code.=<<___;
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
___
$code.=<<___	if ($flavour =~ /64/);
	cmp		$len,#64
	b.hs		.Lgcm_ghash_v8_4x
___
$code.=<<___		if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
						@ "[rotated]" means that
						@ loaded value would have
						@ to be rotated in order to
						@ make it appear as in
						@ algorithm specification
	subs		$len,$len,#32		@ see if $len is 32 or larger
	mov		$inc,#16		@ $inc is used as post-
						@ increment for input pointer;
						@ as loop is modulo-scheduled
						@ $inc is zeroed just in time
						@ to preclude overstepping
						@ inp[len], which means that
						@ last block[s] are actually
						@ loaded twice, but last
						@ copy is not processed
	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H2},[$Htbl]
	cclr		$inc,eq			@ is it time to zero $inc?
	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
	b.lo		.Lodd_tail_v8		@ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
$code.=<<___;
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$In,$t1,$t1,#8
	veor		$IN,$IN,$Xl		@ I[i]^=Xi
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	vpmull2.p64	$Xhn,$H,$In
	b		.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8		$t2,$IN,$IN,#8
	subs		$len,$len,#32		@ is there more data?
	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
	cclr		$inc,lo			@ is it time to zero $inc?

	 vpmull.p64	$Xmn,$Hhl,$t1
	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
	veor		$Xl,$Xl,$Xln		@ accumulate
	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	 vld1.64	{$t0},[$inp],$inc	@ load [rotated] I[i+2]

	veor		$Xh,$Xh,$Xhn
	 cclr		$inc,eq			@ is it time to zero $inc?
	veor		$Xm,$Xm,$Xmn

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	 vrev64.8	$t0,$t0
#endif
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

#ifndef __ARMEB__
	 vrev64.8	$t1,$t1
#endif
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$In,$t1,$t1,#8
	 vext.8		$IN,$t0,$t0,#8
	veor		$Xl,$Xm,$t2
	 vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$IN,$IN,$Xh		@ accumulate $IN early

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$IN,$IN,$t2
	 veor		$t1,$t1,$In		@ Karatsuba pre-processing
	veor		$IN,$IN,$Xl
	 vpmull2.p64	$Xhn,$H,$In
	b.hs		.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor		$Xh,$Xh,$t2
	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
	adds		$len,$len,#32		@ re-construct $len
	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
	b.eq		.Ldone_v8		@ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
	vext.8		$t2,$Xl,$Xl,#8
	veor		$IN,$IN,$Xl		@ inp^=Xi
	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

___
$code.=<<___		if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) {				# 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

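#######
# The loop below extends the 2x aggregation above to four blocks per
# iteration:
#
# Xi+4 =[H*(Ii+3 + Xi+3)] mod P =
#	[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
#
# i.e. four independent multiplications (by H..H^4) and a single
# reduction per iteration.
#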
$code.=<<___;
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
	vld1.64		{$H-$H2},[$Htbl],#48	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H3-$H4},[$Htbl]	@ load twisted H^3, ..., H^4
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant

	vld1.64		{$I0-$j3},[$inp],#64
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$j3,$j3
	vrev64.8	$I0,$I0
#endif
	vext.8		$I3,$j3,$j3,#8
	vext.8		$I2,$j2,$j2,#8
	vext.8		$I1,$j1,$j1,#8

	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	veor		$j3,$j3,$I3
	vpmull2.p64	$Yh,$H,$I3
	vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	veor		$j2,$j2,$I2
	vpmull2.p64	$I2,$H2,$I2
	vpmull2.p64	$j2,$Hhl,$j2

	veor		$Yl,$Yl,$t0
	veor		$Yh,$Yh,$I2
	veor		$Ym,$Ym,$j2

	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	veor		$j1,$j1,$I1
	vpmull2.p64	$I1,$H3,$I1
	vpmull.p64	$j1,$H34,$j1

	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	veor		$Ym,$Ym,$j1

	subs		$len,$len,#128
	b.lo		.Ltail4x

	b		.Loop4x

.align	4
.Loop4x:
	veor		$t0,$I0,$Xl
	 vld1.64	{$I0-$j3},[$inp],#64
	vext.8		$IN,$t0,$t0,#8
#ifndef __ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$j2,$j2
	 vrev64.8	$j3,$j3
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	 vext.8		$I3,$j3,$j3,#8
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	 vext.8		$I2,$j2,$j2,#8
	veor		$Xm,$Xm,$Ym
	 vext.8		$I1,$j1,$j1,#8

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	 vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	 veor		$j3,$j3,$I3
	veor		$Xm,$Xm,$t1
	 vpmull2.p64	$Yh,$H,$I3
	veor		$Xm,$Xm,$t2
	 vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	 veor		$j2,$j2,$I2
	 vpmull2.p64	$I2,$H2,$I2
	veor		$Xl,$Xm,$t2
	 vpmull2.p64	$j2,$Hhl,$j2

	 veor		$Yl,$Yl,$t0
	 veor		$Yh,$Yh,$I2
	 veor		$Ym,$Ym,$j2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	 vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	 veor		$j1,$j1,$I1
	veor		$t2,$t2,$Xh
	 vpmull2.p64	$I1,$H3,$I1
	 vpmull.p64	$j1,$H34,$j1

	veor		$Xl,$Xl,$t2
	 veor		$Yl,$Yl,$j3
	 veor		$Yh,$Yh,$I1
	vext.8		$Xl,$Xl,$Xl,#8
	 veor		$Ym,$Ym,$j1

	subs		$len,$len,#64
	b.hs		.Loop4x

.Ltail4x:
	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym

	adds		$len,$len,#64
	b.eq		.Ldone4x

	cmp		$len,#32
	b.lo		.Lone
	b.eq		.Ltwo
.Lthree:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0-$j2},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$j2,$j2
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$I2,$j2,$j2,#8
	 vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	 vpmull.p64	$Yl,$H,$I2		@ H·Ii+2
	 veor		$j2,$j2,$I2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	 vpmull2.p64	$Yh,$H,$I2
	 vpmull.p64	$Ym,$Hhl,$j2
	veor		$Xl,$Xl,$t2
	 vpmull.p64	$j3,$H2,$I1		@ H^2·Ii+1
	 veor		$j1,$j1,$I1
	vext.8		$Xl,$Xl,$Xl,#8

	 vpmull2.p64	$I1,$H2,$I1
	veor		$t0,$I0,$Xl
	 vpmull2.p64	$j1,$Hhl,$j1
	vext.8		$IN,$t0,$t0,#8

	 veor		$Yl,$Yl,$j3
	 veor		$Yh,$Yh,$I1
	 veor		$Ym,$Ym,$j1

	vpmull.p64	$Xl,$H3,$IN		@ H^3·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H3,$IN
	vpmull.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Ltwo:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0-$j1},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	 vpmull.p64	$Yl,$H,$I1		@ H·Ii+1
	 veor		$j1,$j1,$I1

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	 vpmull2.p64	$Yh,$H,$I1
	 vpmull.p64	$Ym,$Hhl,$j1

	vpmull.p64	$Xl,$H2,$IN		@ H^2·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H2,$IN
	vpmull2.p64	$Xm,$Hhl,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Lone:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H,$IN
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H,$IN
	vpmull.p64	$Xm,$Hhl,$t0

.Ldone4x:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#endif
___

if ($flavour =~ /64/) {			######## 64-bit code
    sub unvmov {
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
					     $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
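    # unvmov() turns the q<reg>#lo/#hi pseudo-syntax used in $code into an
    # AArch64 INS of the corresponding 64-bit lane, renumbering q8-q23 to
    # v16-v31 (q0-q7 map straight to v0-v7, keeping clear of the
    # callee-saved v8-v15); e.g. "vmov q0#lo,q10#hi" becomes
    # "ins v0.d[0],v18.d[1]".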
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
	s/vmov\s+(.*)/unvmov($1)/geo	or
	s/vext\.8/ext/o			or
	s/vshr\.s/sshr\.s/o		or
	s/vshr/ushr/o			or
	s/^(\s+)v/$1/o			or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;				# old->new style commentary

	# fix up remaining legacy suffixes
	s/\.[ui]?8(\s)/$1/o;
	s/\.[uis]?32//o and s/\.16b/\.4s/go;
	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    $word |= 0x00010001	 if ($mnemonic =~ "2");
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the byte order below; the correct solution would be the
	    # .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
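    # unvpmullp64() hand-encodes vpmull(2).p64 for AArch32, since older
    # assemblers do not know the mnemonic; the computed word is emitted
    # byte by byte through the INST() macro defined in the preamble.
    # For example (operands illustrative), "vpmull.p64 q0,q8,q9" encodes
    # to 0xf2a00ea2 and is emitted as INST(0xa2,0x0e,0xa0,0xf2); the "2"
    # variant additionally sets the low Vn/Vm bits to select the high
    # halves of the source registers.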

    foreach(split("\n",$code)) {
	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/^(\s+)b\./$1b/o						or
	s/^(\s+)ret/$1bx\tlr/o;

	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "     it      $2\n";
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush