#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# Compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and is likely to be
# ~15% faster. It would be free from references to the intermediate
# ring buffer, but would put more pressure on L1P [both because the
# code would be larger and because it wouldn't use the SPLOOP buffer].
# There are no plans to implement a fully unrolled variant though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
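#
# A minimal illustrative sketch of an ISR honouring that requirement
# (a hypothetical example for illustration, not part of this module):
#
#	MVC	AMR,B1		; save caller's AMR
#	MVK	0,B0
#	MVC	B0,AMR		; zero AMR -> plain linear addressing
#	...			; ISR body
#	MVC	B1,AMR		; restore AMR before returning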

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";

($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments

($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5");			# X circular buffer
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
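# Note: $XPA and $XPB walk one and the same 64-byte ring buffer of X[i]
# words on the stack; circular addressing for A5/B5 is enabled via AMR
# in the function prologue below.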

$code=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	sha1_block_data_order,_sha1_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	MV,SWAP2
	.asg	MV,SWAP4
	.endif

	.global	_sha1_block_data_order
_sha1_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0			; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	LDW	*${CTX}[0],$A		; load A-E...
|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
   [A0]	LDW	*${CTX}[1],$B
|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
   [A0]	LDW	*${CTX}[2],$C
|| [A0]	MVK	0x00404,B0
   [A0]	LDW	*${CTX}[3],$D
|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for $XP[AB]
   [A0]	LDW	*${CTX}[4],$E
|| [A0]	MVC	B0,AMR			; setup circular addressing
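					; (0x00050404: A5/B5 in circular
					;  mode via BK0, 2^(5+1)=64-byte block)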
	LDNW	*${INP}++,$TX1		; pre-fetch input
	NOP	1

loop?:
	MVK	0x00007999,$K
||	ADDAW	SP,2,$XPA
||	SUB	A0,1,A0
||	MVK	13,B0
	MVKH	0x5a820000,$K		; K_00_19
||	ADDAW	SP,2,$XPB
||	MV	$A,$Actx
||	MV	$B,$Bctx
;;==================================================
	SPLOOPD	5			; BODY_00_13
||	MV	$C,$Cctx
||	MV	$D,$Dctx
||	MV	$E,$Ectx
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX3		; byte swap

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A

	ADD	$TX3,$T,$A		; A=T+Xi
||	STW	$TX3,*${XPB}++
	SPKERNEL
;;==================================================
	ROTL	$A,5,$Arot		; BODY_14
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
||	LDW	*${XPB}[4],$X2		; 2 iterations ahead

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
;;==================================================
	ROTL	$A,5,$Arot		; BODY_15
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	MVK	3,B0
;;==================================================
	SPLOOPD	5			; BODY_16_19
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL

	MVK	0xffffeba1,$K
||	MVK	19,B0
	MVKH	0x6ed90000,$K		; K_20_39
___
sub BODY_20_39 {
$code.=<<___;
;;==================================================
	SPLOOPD	5			; BODY_20_39
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	XOR	$B,$C,$F
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$D,$F,$F		; F_20_39(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++		; last one is redundant
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL
___
$code.=<<___ if (!shift);
	MVK	0xffffbcdc,$K
	MVKH	0x8f1b0000,$K		; K_40_59
___
}	&BODY_20_39();
$code.=<<___;
;;==================================================
	SPLOOPD	5			; BODY_40_59
||	MVC	B0,ILC
||	AND	$B,$C,$F
||	AND	$B,$D,$F0

	ROTL	$A,5,$Arot
||	XOR	$F0,$F,$F
||	AND	$C,$D,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_40_59(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	AND	$B,$C,$F
||	AND	$B,$D,$F0
	SPKERNEL

	MVK	0xffffc1d6,$K
||	MVK	18,B0
	MVKH	0xca620000,$K		; K_60_79
___
	&BODY_20_39(-1);		# BODY_60_78
$code.=<<___;
;;==================================================
   [A0]	B	loop?
||	ROTL	$A,5,$Arot		; BODY_79
||	XOR	$B,$C,$F
||	ROTL	$TX1,1,$TX2		; Xupdate output

   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
||	ADD	$K,$E,$T		; T=E+K
||	XOR	$D,$F,$F		; F_20_39(B,C,D)

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
||	ROTL	$B,30,$C		; C=ROL(B,30)

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx

	ADD	$TX2,$T,$A		; A=T+Xi

	ADD	$Actx,$A,$A		; A+=Actx
||	ADD	$Cctx,$C,$C		; C+=Cctx
;; end of loop?

	BNOP	RA			; return
||	MV	FP,SP			; restore stack pointer
||	LDW	*FP[0],FP		; restore frame pointer
	STW	$A,*${CTX}[0]		; emit A-E...
||	MVK	0,B0
	STW	$B,*${CTX}[1]
||	MVC	B0,AMR			; clear AMR
	STW	$C,*${CTX}[2]
	STW	$D,*${CTX}[3]
	STW	$E,*${CTX}[4]
	.endasmfunc

	.sect	.const
	.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

print $code;
close STDOUT or die "error closing STDOUT: $!";