#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in absolute
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles, or 23% faster than the integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but that is 50% faster than its integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}
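
# Typical invocations (an assumption based on how perlasm scripts are driven
# by the build system; flavours are e.g. "linux32" or "ios32"):
#
#	perl sha256-armv4.pl linux32 sha256-armv4.S	# via arm-xlate.pl
#	perl sha256-armv4.pl void sha256-armv4.S	# raw, untranslated output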

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
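
# The scratch names alias the argument registers: $t0/$t4/$t1/$t3 share
# r0-r3 with $ctx/$inp/$len/$T1. This is safe because the arguments are
# spilled to the stack (stmdb on entry, str of $inp at round 15) before the
# aliases are clobbered; $Ktbl lives in lr, which is saved on entry and
# reloaded as pc on return.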

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
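
# The integer rounds below avoid one rotate per Sigma: they XOR $e (or $a)
# with itself rotated by the *differences* of the rotate counts and apply the
# outstanding ror#$Sigma1[0] (or ror#$Sigma0[0]) just once, as the result is
# added into $h. Rotation distributes over XOR, so the two forms agree.
# A minimal reference sketch (hypothetical helpers, never called by the
# generator; assumes 64-bit Perl integers):
sub _ror32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
sub _Sigma1_ref {
	my $e = shift;
	my $t = $e ^ _ror32($e,$Sigma1[1]-$Sigma1[0])
		   ^ _ror32($e,$Sigma1[2]-$Sigma1[0]);
	_ror32($t,$Sigma1[0]);	# == ror(e,6) ^ ror(e,11) ^ ror(e,25)
}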

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
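
# BODY_00_15 never materializes ~e and needs no extra AND for Maj: it relies
# on the identities Ch(e,f,g) = ((f^g)&e)^g and Maj(a,b,c) = ((b^c)&(a^b))^b,
# with b^c carried over from the previous round in the $t2/$t3 pair swapped
# above. Reference sketch of the two identities (hypothetical helpers, never
# called by the generator):
sub _Ch_ref  { my ($e,$f,$g)=@_; (($f^$g)&$e)^$g; }		# == (e&f)|(~e&g)
sub _Maj_ref { my ($a,$b,$c)=@_; (($b^$c)&($a^$b))^$b; }	# bitwise majority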

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
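
# In terms of the SHA-256 schedule W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7]
# + sigma1(W[i-2]), the 16-word circular buffer on the stack turns this into
# X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16]), which
# is what the loads at offsets (i+1), (i+14), (i+0) and (i+9) above implement.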

$code=<<___;
#ifndef __KERNEL__
# include <GFp/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors, and it does have ARMv8-only code, but those
@ instructions are manually encoded. (See unsha256.)
.arch  armv7-a

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.extern GFp_armcap_P
.hidden GFp_armcap_P
.LOPENSSL_armcap:
.word	GFp_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	GFp_sha256_block_data_order
.type	GFp_sha256_block_data_order,%function
GFp_sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ GFp_sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ GFp_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	GFp_sha256_block_data_order,.-GFp_sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
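
# For example, &vshr_u32($T2,$T0,$sigma0[0]) below comes out as
# "vshr.u32	q10,q8,#7": the sub name becomes the mnemonic (its first "_"
# turned into "."), register arguments pass through unchanged, and a numeric
# final argument is prefixed with "#" to make it an immediate.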

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
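
# Xpreload covers the last 16 of the 64 rounds: the message schedule is
# complete by then, so between the same four-round batches of integer
# instructions it only byte-swaps the freshly loaded next input block and
# pre-adds the first four K256 vectors for the following outer iteration.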

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
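
# Each evaluation of body_00_15 returns the instruction strings for exactly
# one integer round; Xupdate and Xpreload each pull in four rounds and
# interleave them, a couple of instructions at a time, with their NEON code,
# so every pass through .L_00_48 below retires 16 rounds.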

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
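
# Sixteen "quad rounds" follow. The twelve generated by this loop also run
# the message schedule forward with sha256su0/sha256su1, while sha256h and
# sha256h2 advance the two halves of the state; each iteration consumes one
# 4-word vector of round constants.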
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___

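# Replicate the commentary from the top of this file (everything up to the
# first line that is neither a comment nor blank) in the generated assembly,
# rewriting Perl's "#" comments as the assembler's "@" comments.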
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence the
	    # byte order below. The correct solution would be the .inst
	    # directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
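
# Worked example: "sha256h q0,q1,q12" plugs into the formula above as word
# 0xf3020c68 and is emitted as INST(0x68,0x0c,0x02,0xf3); the INST macro then
# lays the bytes down in little-endian order for ARM mode, or swaps the
# halfwords (fixing up the 0xf3->0xff escape via d|0xc) for Thumb-2.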

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT"; # enforce flush