#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

# Select the output file: consume command-line arguments until one looks
# like "name.ext" (word characters and dashes, one dot, an extension).
# Arguments that do not match are discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found.  When none was
# given, the generated assembly keeps going to the inherited STDOUT, so
# callers that redirect stdout themselves continue to work.  A failure to
# open a requested file is now fatal instead of being silently ignored.
if (defined($output) && length($output)) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Register allocation for the integer code path.  The scratch names
# $t0/$t4/$t1/$t3 share registers r0..r3 with $ctx/$inp/$len/$T1, so a
# value held under one name is lost when the aliased name is written.
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
# The eight SHA-256 working variables a..h.
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
# @V is rotated by one position after every generated round.
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";	# walks the K256 constant table

# Rotate amounts for the Sigma0/Sigma1 round functions, and for the
# sigma0/sigma1 message-schedule functions (whose third entry is used as a
# logical-shift amount, see the lsr operands in BODY_16_XX).
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# Append the assembly for one SHA-256 round to $code.
#
# Arguments: the round index $i followed by the names of the eight registers
# currently playing the roles a..h.
#
# For $i<16 a prologue is emitted first that assembles the next input word
# (ldr+rev on ARMv7+, four byte loads otherwise) and starts Sigma1(e); for
# later rounds BODY_16_XX has already emitted the equivalent prologue.
# The "#if $i==..." lines are C-preprocessor text in the generated file,
# not Perl conditionals.
#
# Side effect: $t2 and $t3 are swapped on return, so that the a^b computed
# in this round is picked up as b^c by the next one.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

# Append the assembly for one round with message-schedule expansion.
#
# Takes the same arguments as BODY_00_15 (round index, then the register
# names for a..h).  The emitted code derives the new X[i] from the 16-word
# window kept on the stack: sigma0 of the word in $t1 and sigma1 of the word
# in $t4 (both were preloaded by the previous round, hence the commented-out
# loads), plus the words at slots (i+0)%16 and (i+9)%16.  It also starts
# Sigma1(e), then hands over to BODY_00_15 for the common part of the round.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

# Start the generated file: preprocessor prelude, instruction-set selection,
# the K256 round-constant table (followed by a zero word and alignment
# padding), and the entry sequence of sha256_block_data_order up to the
# head of the per-block loop.  Note this is an assignment, not an append:
# anything previously stored in $code is discarded.
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code   32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Unroll rounds 0..15 (these consume the input block), then rounds 16..31
# under the .Lrounds_16_xx label.  The role-to-register mapping in @V is
# rotated by one after every round.  The generated code branches back to
# .Lrounds_16_xx until the "done?" comparison emitted in round 31 succeeds.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue: add the working variables into the context, loop over the
# remaining input blocks, then tear down the stack frame and return.
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
# The triple brace opens a lexical scope (closed after the NEON epilogue)
# so the names below do not leak into the rest of the script.  Note that
# the lexical $T1 declared here hides the global $T1 inside this scope.
{{{
my @X=map("q$_",(0..3));	# four quad registers holding the 16 message words
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;			# pointer into the X[i]+K[i] transfer area on the stack
my $j=0;			# round counter advanced by the body_00_15 snippets

# Map a quad register name "qN" to "d<2N>", the lower-numbered of the two
# D register names derived from it.  Returns "" when the argument contains
# no "q<number>" token.  (Declared without the former empty prototype,
# which contradicted the one-argument usage.)
sub Dlo   { my ($qreg) = @_; return $qreg=~m|q([1]?[0-9])| ? "d".($1*2) : ""; }
# Map a quad register name "qN" to "d<2N+1>", the higher-numbered of the
# two D register names derived from it.  Returns "" when the argument
# contains no "q<number>" token.  (Declared without the former empty
# prototype, which contradicted the one-argument usage.)
sub Dhi   { my ($qreg) = @_; return $qreg=~m|q([1]?[0-9])| ? "d".($1*2+1) : ""; }

# Thunk [simplified] x86-style perlasm: any call to an undefined sub lands
# here and is turned into one line of assembly appended to $code.
#
# The sub name becomes the mnemonic, with its first underscore replaced by
# a dot (vadd_i32 -> vadd.i32).  The arguments become the comma-separated
# operand list; the last argument gets a "#" prefix when it is a plain
# number, i.e. an immediate.
sub AUTOLOAD
{
    (my $opcode = $AUTOLOAD) =~ s/.*:://;	# strip the package qualifier
    $opcode =~ s/_/\./;
    my $arg = pop;
    # A value that survives a numeric round trip unchanged is a number.
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Emit the NEON message-schedule update for the four words in $X[0],
# interleaved with four scalar rounds.
#
# $body is a code ref returning the list of Perl snippets that make up one
# scalar round; it is invoked four times and the snippets are eval'ed a few
# at a time between the NEON instructions.  The order of the statements
# below therefore IS the instruction schedule -- do not reorder.
#
# After the update the next four K256 constants are loaded through $Ktbl,
# added to the new words and stored through $Xfer, and @X is rotated.
#
# (Declared without the former empty prototype: the sub takes one argument
# and all callers use the &-form.)
sub Xupdate
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  # Targets of the "($a,...,$h)=@V" assignment inside the eval'ed snippets.
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 # drain the scalar snippets, keeping the last two for after the store
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

# Emit four scalar rounds interleaved with the preparation of four freshly
# loaded message words in $X[0]: load the next four K256 constants through
# $Ktbl, byte-reverse the words, add the constants and store the sums
# through $Xfer.  No schedule expansion is done here.
#
# $body is a code ref returning the Perl snippets of one scalar round; it
# is invoked four times.  As in Xupdate, the statement order below is the
# instruction schedule.  @X is rotated on return.
#
# (Declared without the former empty prototype: the sub takes one argument
# and all callers use the &-form.)
sub Xpreload
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  # Targets of the "($a,...,$h)=@V" assignment inside the eval'ed snippets.
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

# Return one scalar SHA-256 round as a list of Perl source snippets.
#
# Nothing is emitted here: the caller (Xupdate/Xpreload) eval's the
# snippets one by one, which lets it interleave them with NEON
# instructions.  Each snippet calls instruction thunks (&add, &eor, ...)
# that resolve through AUTOLOAD.  The first snippet binds $a..$h to the
# current @V; the last one advances the round counter $j, rotates @V and
# swaps $t2/$t3.  Snippets joined with "." below form a single list
# element and are therefore always emitted together.
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

# Entry of sha256_block_data_order_neon: carve an aligned scratch area out
# of the stack, load and byte-reverse the first input block, store
# X[0..15]+K[0..15] into the scratch area, save ctx/inp/end-of-input/old sp
# at sp+64..sp+76, load the working variables and fall into .L_00_48.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,.Lsha256_block_data_order
	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
516f2f770d7SSami Tolvanen	&Xupdate(\&body_00_15);
517f2f770d7SSami Tolvanen	&Xupdate(\&body_00_15);
518f2f770d7SSami Tolvanen	&Xupdate(\&body_00_15);
519f2f770d7SSami Tolvanen	&Xupdate(\&body_00_15);
520f2f770d7SSami Tolvanen$code.=<<___;
521f2f770d7SSami Tolvanen	teq	$t1,#0				@ check for K256 terminator
522f2f770d7SSami Tolvanen	ldr	$t1,[sp,#0]
523f2f770d7SSami Tolvanen	sub	$Xfer,$Xfer,#64
524f2f770d7SSami Tolvanen	bne	.L_00_48
525f2f770d7SSami Tolvanen
526f2f770d7SSami Tolvanen	ldr		$inp,[sp,#68]
527f2f770d7SSami Tolvanen	ldr		$t0,[sp,#72]
528f2f770d7SSami Tolvanen	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
529f2f770d7SSami Tolvanen	teq		$inp,$t0
530f2f770d7SSami Tolvanen	it		eq
531f2f770d7SSami Tolvanen	subeq		$inp,$inp,#64		@ avoid SEGV
532f2f770d7SSami Tolvanen	vld1.8		{@X[0]},[$inp]!		@ load next input block
533f2f770d7SSami Tolvanen	vld1.8		{@X[1]},[$inp]!
534f2f770d7SSami Tolvanen	vld1.8		{@X[2]},[$inp]!
535f2f770d7SSami Tolvanen	vld1.8		{@X[3]},[$inp]!
536f2f770d7SSami Tolvanen	it		ne
537f2f770d7SSami Tolvanen	strne		$inp,[sp,#68]
538f2f770d7SSami Tolvanen	mov		$Xfer,sp
539f2f770d7SSami Tolvanen___
540f2f770d7SSami Tolvanen	&Xpreload(\&body_00_15);
541f2f770d7SSami Tolvanen	&Xpreload(\&body_00_15);
542f2f770d7SSami Tolvanen	&Xpreload(\&body_00_15);
543f2f770d7SSami Tolvanen	&Xpreload(\&body_00_15);
544f2f770d7SSami Tolvanen$code.=<<___;
545f2f770d7SSami Tolvanen	ldr	$t0,[$t1,#0]
546f2f770d7SSami Tolvanen	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
547f2f770d7SSami Tolvanen	ldr	$t2,[$t1,#4]
548f2f770d7SSami Tolvanen	ldr	$t3,[$t1,#8]
549f2f770d7SSami Tolvanen	ldr	$t4,[$t1,#12]
550f2f770d7SSami Tolvanen	add	$A,$A,$t0			@ accumulate
551f2f770d7SSami Tolvanen	ldr	$t0,[$t1,#16]
552f2f770d7SSami Tolvanen	add	$B,$B,$t2
553f2f770d7SSami Tolvanen	ldr	$t2,[$t1,#20]
554f2f770d7SSami Tolvanen	add	$C,$C,$t3
555f2f770d7SSami Tolvanen	ldr	$t3,[$t1,#24]
556f2f770d7SSami Tolvanen	add	$D,$D,$t4
557f2f770d7SSami Tolvanen	ldr	$t4,[$t1,#28]
558f2f770d7SSami Tolvanen	add	$E,$E,$t0
559f2f770d7SSami Tolvanen	str	$A,[$t1],#4
560f2f770d7SSami Tolvanen	add	$F,$F,$t2
561f2f770d7SSami Tolvanen	str	$B,[$t1],#4
562f2f770d7SSami Tolvanen	add	$G,$G,$t3
563f2f770d7SSami Tolvanen	str	$C,[$t1],#4
564f2f770d7SSami Tolvanen	add	$H,$H,$t4
565f2f770d7SSami Tolvanen	str	$D,[$t1],#4
566f2f770d7SSami Tolvanen	stmia	$t1,{$E-$H}
567f2f770d7SSami Tolvanen
568f2f770d7SSami Tolvanen	ittte	ne
569f2f770d7SSami Tolvanen	movne	$Xfer,sp
570f2f770d7SSami Tolvanen	ldrne	$t1,[sp,#0]
571f2f770d7SSami Tolvanen	eorne	$t2,$t2,$t2
572f2f770d7SSami Tolvanen	ldreq	sp,[sp,#76]			@ restore original sp
573f2f770d7SSami Tolvanen	itt	ne
574f2f770d7SSami Tolvanen	eorne	$t3,$B,$C
575f2f770d7SSami Tolvanen	bne	.L_00_48
576f2f770d7SSami Tolvanen
577f2f770d7SSami Tolvanen	ldmia	sp!,{r4-r12,pc}
578f2f770d7SSami Tolvanen.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
579f2f770d7SSami Tolvanen#endif
580f2f770d7SSami Tolvanen___
581f2f770d7SSami Tolvanen}}}
######################################################################
# ARMv8 stuff
#
# Code path built on the sha256h/sha256h2/sha256su0/sha256su1
# instructions.  The emitted code is wrapped in
# "#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)", so it is compiled out
# of kernel builds.  The triple brace gives the section its own lexical
# scope; $Ktbl is r3 here, not r14.
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
# Twelve four-round groups that also extend the message schedule.  After
# each group the two constant registers swap roles and @MSG rotates.
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
# Final four groups (no schedule update), accumulation into the saved
# state, loop over the remaining blocks and return.
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
# Trailer appended to the generated assembly: an identification string and,
# for non-kernel builds targeting ARMv7 or later, the common symbol
# OPENSSL_armcap_P.
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

# Copy this script's own leading comment block (licence and history notes)
# to the output, turning each leading '#' into '@' so the text becomes an
# assembler comment.  The shebang line is skipped, blank lines are copied
# as-is, and copying stops at the first line that is neither a comment nor
# blank.
#
# Three-argument open with an explicit read mode keeps characters in the
# script path from being interpreted as open-mode specifiers.  Failing to
# reopen the script only costs the header text, so it is reported with a
# warning instead of aborting generation.
if (open(my $self,'<',$0)) {
	while(<$self>) {
		next if (/^#!/);
		last if (!s/^#/@/ and !/^$/);
		print;
	}
	close $self;
} else {
	warn "$0: cannot reopen script to copy its header comments: $!\n";
}
689f2f770d7SSami Tolvanen
{   # Base encodings of the SHA-256 instructions with all register fields
    # zero; unsha256() ORs the operand register numbers into them.
    my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # unsha256($mnemonic,$arg)
    #
    # Hand-assemble one SHA-256 instruction so the output builds with
    # assemblers that do not know the mnemonic.
    #
    #   $mnemonic - instruction name, expected to be a key of %opcode
    #   $arg      - operand text, "qD,qN,qM" or the two-register "qD,qM"
    #
    # Returns an "INST(b0,b1,b2,b3)" macro invocation carrying the four
    # encoded bytes, followed by the original instruction as a comment.
    # If the mnemonic is unknown or the operands cannot be parsed, the
    # instruction text is returned unchanged ("$mnemonic\t$arg") so that
    # it reaches the assembler instead of silently disappearing from the
    # output or being emitted as a bogus encoding.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;
	my $passthrough = "$mnemonic\t$arg";

	return $passthrough unless exists $opcode{$mnemonic};

	# List-context match: three captures on success (the middle one is
	# undef for the two-register form), empty list on failure.
	my @regs = $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/
	    or return $passthrough;

	# An absent middle register contributes zero bits.
	my ($rd,$rn,$rm) = map { defined($_) ? int($_) : 0 } @regs;

	# Each register number is split in two: its low three bits go into
	# the register field, bit 3 into a separate single-bit field.
	my $word = $opcode{$mnemonic}|(($rd&7)<<13)|(($rd&8)<<19)
				     |(($rn&7)<<17)|(($rn&8)<<4)
				     |(($rm&7)<<1) |(($rm&8)<<2);

	# ARMv7 instructions are always encoded little-endian, so the bytes
	# are listed least-significant first.  The correct solution would
	# be the .inst directive, but older assemblers don't implement it.
	return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
}
711f2f770d7SSami Tolvanen
# Post-process the accumulated assembly text one line at a time and write
# the result to STDOUT.
foreach (split($/,$code)) {

	# Replace every `...` span with the result of evaluating its
	# contents as Perl.
	s/\`([^\`]*)\`/eval $1/geo;

	# Replace sha256* instructions on q registers with the byte
	# sequence produced by unsha256().
	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	# "ret" becomes "bx lr"; only a line that contained no "ret" has a
	# literal "bx lr" turned into its raw instruction word.
	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

# Closing flushes the output buffer.  Write errors that were deferred by
# buffering are only reported here, so a failed close must abort with a
# non-zero status rather than leave a truncated file behind unnoticed.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
725