#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was
# done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    open STDOUT,">$output" or die "can't open $output: $!";
}
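
# The script takes a "flavour" (e.g. linux32 or ios32) followed by an
# output path; the flavour selects the arm-xlate.pl translation, while
# the reserved flavour "void" writes the raw perlasm straight to the
# output file. A typical invocation (illustrative, the build system
# normally drives this) would be:
#
#   perl sha256-armv4.pl linux32 sha256-armv4.S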

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
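
# Register budget note (derived from the code below): the argument
# registers r0-r2 ($ctx,$inp,$len) double as scratch registers
# ($t0,$t4,$t1) once their values are stashed in the stack frame, and
# the link register r14 is repurposed as the K256 table pointer, which
# is why the prologue saves lr and the epilogue returns via the stack.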

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
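
# These are the FIPS 180-4 rotate/shift amounts:
#	Sigma0(x) = ror(x,2)  ^ ror(x,13) ^ ror(x,22)
#	Sigma1(x) = ror(x,6)  ^ ror(x,11) ^ ror(x,25)
#	sigma0(x) = ror(x,7)  ^ ror(x,18) ^ (x>>3)
#	sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x>>10)
# The integer code below exploits ror(x,a)^ror(x,b) = ror(x^ror(x,b-a),a):
# e.g. Sigma1(e) is computed as ror(e ^ ror(e,11-6) ^ ror(e,25-6), 6),
# which is why the eor instructions use the rotation *differences* and
# the final add applies ror#$Sigma1[0].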

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
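
# Note on the "from the past" comments above: the Maj(a,b,c) term of
# round i is left in a scratch register and only folded into the
# working variable at the start of round i+1 (and once more after the
# final round), which shortens the critical path of each round; the
# ($t2,$t3) swap at the end of the sub renames the two scratch
# registers between rounds.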

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
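
# This computes the message-schedule expansion
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# ($t1 and $t4 already hold X[i-15] and X[i-2] thanks to the previous
# round's "from future" loads) and then falls through to BODY_00_15
# for the round computation proper.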

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
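
# Each NEON quad register aliases a pair of double registers
# (q<n> = d<2n>:d<2n+1>), so Dlo/Dhi map e.g. "q3" to "d6"/"d7" to
# address the low and high 64-bit halves individually.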

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
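
# The AUTOLOAD thunk turns any undefined sub call into an emitted
# instruction: the sub name becomes the mnemonic (with "_" mapped to
# "."), and a purely numeric last argument gains a "#" prefix. For
# example, &vext_8($T0,@X[0],@X[1],4) appends "vext.8 q8,q0,q1,#4"
# to $code.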

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
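
# Xpreload interleaves the last 16 scalar rounds of the current block
# with preparation of the next one: it byte-swaps the freshly loaded
# input words and stores X[i]+K256[i] into the stack buffer at $Xfer
# for the upcoming block's first 16 rounds.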

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
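
# body_00_15 returns one scalar SHA-256 round as a list of Perl
# snippets; Xupdate/Xpreload above call it four times and eval one
# snippet between every vector instruction, interleaving the integer
# rounds with the NEON message-schedule computation so both pipelines
# stay busy.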

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___
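
# Twelve quad-rounds above use sha256su0/sha256su1 to keep expanding
# the message schedule; the four concluding quad-rounds below need no
# further expansion, for 16 quad-rounds (64 rounds) in total. Swapping
# ($W0,$W1) and rotating @MSG slide the K and message windows forward.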

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
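
# The loop above copies this script's own leading comment block (the
# license and performance notes) into the generated file, rewriting
# the Perl "#" comment prefix as the assembler's "@"; it stops at the
# first line that is neither a "#" comment nor blank.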

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so the
	    # word is emitted byte by byte. The correct solution is to use
	    # the .inst directive, but older assemblers don't implement
	    # it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
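
# unsha256 hand-assembles the SHA-256 extension instructions, which
# pre-extension assemblers don't know, by splicing the three q-register
# numbers into the base opcode's Vd/Vn/Vm fields (low three bits beside
# the D/N/M high bits). For instance, "sha256h q0,q1,q12" packs to
# 0xf3020c68 under this scheme and is emitted as
# INST(0x68,0x0c,0x02,0xf3).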

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;	# evaluate numeric expressions in backticks

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;	# hand-assemble crypto insns

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush