#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PPC64.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT implementation that works on
# *any* PPC64. PowerISA 2.07 adds a 2x64-bit vector rotate, which makes
# it possible to achieve better performance than listed below, but that
# is naturally an option only for POWER8 and its successors...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# PPC970/G5	14.6/+120%
# POWER7	10.3/+100%
# POWER8	11.5/+85%
# POWER9	9.4/+45%
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
#	much better (but watch out for them generating code specific
#	to the processor they execute on).

# The first command-line argument selects the target flavour. Only flavours
# whose name contains "64" are accepted; anything else aborts generation.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;		# size of a pointer/GPR save slot in bytes
	$LRSAVE	=2*$SIZE_T;	# offset of the LR save slot relative to the caller's frame
	$UCMP	="cmpld";	# unsigned compare mnemonic
	$STU	="stdu";	# store-with-update, used to allocate the stack frame
	$POP	="ld";		# restore a register from the frame
	$PUSH	="std";		# save a register to the frame
} else { die "nonsense $flavour"; }

# Locate ppc-xlate.pl: first next to this script, then in ../../perlasm
# relative to the script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The next command-line argument (if any) is forwarded to it unchanged.
#
# Note the low-precedence "or": with "||" the die clause binds to the
# (always true) command string, so a failed open would go unnoticed.
my $output = shift;
$output = "" unless defined $output;
open STDOUT, "| $^X $xlate $flavour $output"
    or die "can't call $xlate: $!";

# Stack frame layout shared by KeccakF1600 and SHA3_absorb (offsets from $sp):
#   $LOCALS + 0..4 slots : A[][] pointer, inp, len, bsz, current iotas pointer
#   $TEMP   + 0..24      : spill area for A[0..3][4] during Theta
#   $FRAME - 18*$SIZE_T .. $FRAME - 1*$SIZE_T : saved r14..r31
$FRAME=24*$SIZE_T+6*$SIZE_T+32;
$LOCALS=6*$SIZE_T;
$TEMP=$LOCALS+6*$SIZE_T;

my $sp ="r1";

# The 5x5 state lives entirely in registers: row i occupies five consecutive
# registers starting at r7, r12, r17, r22 and r27 respectively...
my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ],
            (7, 12, 17, 22, 27));
   $A[1][1] = "r6"; # r13 is reserved

# ...which leaves r0, r3, r4 and r5 as scratch registers.
my @C = map("r$_", (0,3,4,5));

# Rho rotation amounts, indexed the same way as @A.
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# KeccakF1600_int: the 24-round permutation on the register-resident state.
#
# On entry the 25 lanes are expected in the @A registers and the slot at
# $LOCALS+4*$SIZE_T($sp) must hold the address of the round-constant table
# minus 8 (each round fetches its constant with a pre-incrementing ldu and
# stores the advanced pointer back). The routine allocates no frame of its
# own: it uses the $TEMP and $LOCALS slots relative to the caller's $sp.
# Scratch registers in @C are clobbered; the counter register holds the
# number of remaining rounds.
$code.=<<___;
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	b	.Loop
.align	4
.Loop:
	xor	$C[0],$A[0][0],$A[1][0]		; Theta
	std	$A[0][4],`$TEMP+0`($sp)
	xor	$C[1],$A[0][1],$A[1][1]
	std	$A[1][4],`$TEMP+8`($sp)
	xor	$C[2],$A[0][2],$A[1][2]
	std	$A[2][4],`$TEMP+16`($sp)
	xor	$C[3],$A[0][3],$A[1][3]
	std	$A[3][4],`$TEMP+24`($sp)
___
	# A[0..3][4] have just been spilled to $TEMP, so their registers are
	# borrowed as four extra temporaries for the rest of Theta.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
	$C[6]=$A[2][4];
	$C[7]=$A[3][4];
$code.=<<___;
	xor	$C[4],$A[0][4],$A[1][4]
	xor	$C[0],$C[0],$A[2][0]
	xor	$C[1],$C[1],$A[2][1]
	xor	$C[2],$C[2],$A[2][2]
	xor	$C[3],$C[3],$A[2][3]
	xor	$C[4],$C[4],$A[2][4]
	xor	$C[0],$C[0],$A[3][0]
	xor	$C[1],$C[1],$A[3][1]
	xor	$C[2],$C[2],$A[3][2]
	xor	$C[3],$C[3],$A[3][3]
	xor	$C[4],$C[4],$A[3][4]
	xor	$C[0],$C[0],$A[4][0]
	xor	$C[2],$C[2],$A[4][2]
	xor	$C[1],$C[1],$A[4][1]
	xor	$C[3],$C[3],$A[4][3]
	rotldi	$C[5],$C[2],1
	xor	$C[4],$C[4],$A[4][4]
	rotldi	$C[6],$C[3],1
	xor	$C[5],$C[5],$C[0]
	rotldi	$C[7],$C[4],1

	xor	$A[0][1],$A[0][1],$C[5]
	xor	$A[1][1],$A[1][1],$C[5]
	xor	$A[2][1],$A[2][1],$C[5]
	xor	$A[3][1],$A[3][1],$C[5]
	xor	$A[4][1],$A[4][1],$C[5]

	rotldi	$C[5],$C[0],1
	xor	$C[6],$C[6],$C[1]
	xor	$C[2],$C[2],$C[7]
	rotldi	$C[7],$C[1],1
	xor	$C[3],$C[3],$C[5]
	xor	$C[4],$C[4],$C[7]

	xor	$C[1],   $A[0][2],$C[6]			;mr	$C[1],$A[0][2]
	xor	$A[1][2],$A[1][2],$C[6]
	xor	$A[2][2],$A[2][2],$C[6]
	xor	$A[3][2],$A[3][2],$C[6]
	xor	$A[4][2],$A[4][2],$C[6]

	xor	$A[0][0],$A[0][0],$C[4]
	xor	$A[1][0],$A[1][0],$C[4]
	xor	$A[2][0],$A[2][0],$C[4]
	xor	$A[3][0],$A[3][0],$C[4]
	xor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers are about to be reloaded with A[0..3][4];
	# drop the aliases so they cannot be used by mistake below.
	$C[4]=undef;
	$C[5]=undef;
	$C[6]=undef;
	$C[7]=undef;
$code.=<<___;
	ld	$A[0][4],`$TEMP+0`($sp)
	xor	$C[0],   $A[0][3],$C[2]			;mr	$C[0],$A[0][3]
	ld	$A[1][4],`$TEMP+8`($sp)
	xor	$A[1][3],$A[1][3],$C[2]
	ld	$A[2][4],`$TEMP+16`($sp)
	xor	$A[2][3],$A[2][3],$C[2]
	ld	$A[3][4],`$TEMP+24`($sp)
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[4][3],$A[4][3],$C[2]

	xor	$C[2],   $A[0][4],$C[3]			;mr	$C[2],$A[0][4]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[4][4],$A[4][4],$C[3]

	mr	$C[3],$A[0][1]				; Rho+Pi
	rotldi	$A[0][1],$A[1][1],$rhotates[1][1]
	;mr	$C[1],$A[0][2]
	rotldi	$A[0][2],$A[2][2],$rhotates[2][2]
	;mr	$C[0],$A[0][3]
	rotldi	$A[0][3],$A[3][3],$rhotates[3][3]
	;mr	$C[2],$A[0][4]
	rotldi	$A[0][4],$A[4][4],$rhotates[4][4]

	rotldi	$A[1][1],$A[1][4],$rhotates[1][4]
	rotldi	$A[2][2],$A[2][3],$rhotates[2][3]
	rotldi	$A[3][3],$A[3][2],$rhotates[3][2]
	rotldi	$A[4][4],$A[4][1],$rhotates[4][1]

	rotldi	$A[1][4],$A[4][2],$rhotates[4][2]
	rotldi	$A[2][3],$A[3][4],$rhotates[3][4]
	rotldi	$A[3][2],$A[2][1],$rhotates[2][1]
	rotldi	$A[4][1],$A[1][3],$rhotates[1][3]

	rotldi	$A[4][2],$A[2][4],$rhotates[2][4]
	rotldi	$A[3][4],$A[4][3],$rhotates[4][3]
	rotldi	$A[2][1],$A[1][2],$rhotates[1][2]
	rotldi	$A[1][3],$A[3][1],$rhotates[3][1]

	rotldi	$A[2][4],$A[4][0],$rhotates[4][0]
	rotldi	$A[4][3],$A[3][0],$rhotates[3][0]
	rotldi	$A[1][2],$A[2][0],$rhotates[2][0]
	rotldi	$A[3][1],$A[1][0],$rhotates[1][0]

	rotldi	$A[1][0],$C[0],$rhotates[0][3]
	rotldi	$A[2][0],$C[3],$rhotates[0][1]
	rotldi	$A[3][0],$C[2],$rhotates[0][4]
	rotldi	$A[4][0],$C[1],$rhotates[0][2]

	andc	$C[0],$A[0][2],$A[0][1]			; Chi+Iota
	andc	$C[1],$A[0][3],$A[0][2]
	andc	$C[2],$A[0][0],$A[0][4]
	andc	$C[3],$A[0][1],$A[0][0]
	xor	$A[0][0],$A[0][0],$C[0]
	andc	$C[0],$A[0][4],$A[0][3]
	xor	$A[0][1],$A[0][1],$C[1]
	 ld	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	xor	$A[0][3],$A[0][3],$C[2]
	xor	$A[0][4],$A[0][4],$C[3]
	xor	$A[0][2],$A[0][2],$C[0]
	 ldu	$C[3],8($C[1])				; Iota[i++]

	andc	$C[0],$A[1][2],$A[1][1]
	 std	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	andc	$C[1],$A[1][3],$A[1][2]
	andc	$C[2],$A[1][0],$A[1][4]
	 xor	$A[0][0],$A[0][0],$C[3]			; A[0][0] ^= Iota
	andc	$C[3],$A[1][1],$A[1][0]
	xor	$A[1][0],$A[1][0],$C[0]
	andc	$C[0],$A[1][4],$A[1][3]
	xor	$A[1][1],$A[1][1],$C[1]
	xor	$A[1][3],$A[1][3],$C[2]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[1][2],$A[1][2],$C[0]

	andc	$C[0],$A[2][2],$A[2][1]
	andc	$C[1],$A[2][3],$A[2][2]
	andc	$C[2],$A[2][0],$A[2][4]
	andc	$C[3],$A[2][1],$A[2][0]
	xor	$A[2][0],$A[2][0],$C[0]
	andc	$C[0],$A[2][4],$A[2][3]
	xor	$A[2][1],$A[2][1],$C[1]
	xor	$A[2][3],$A[2][3],$C[2]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[2][2],$A[2][2],$C[0]

	andc	$C[0],$A[3][2],$A[3][1]
	andc	$C[1],$A[3][3],$A[3][2]
	andc	$C[2],$A[3][0],$A[3][4]
	andc	$C[3],$A[3][1],$A[3][0]
	xor	$A[3][0],$A[3][0],$C[0]
	andc	$C[0],$A[3][4],$A[3][3]
	xor	$A[3][1],$A[3][1],$C[1]
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[3][2],$A[3][2],$C[0]

	andc	$C[0],$A[4][2],$A[4][1]
	andc	$C[1],$A[4][3],$A[4][2]
	andc	$C[2],$A[4][0],$A[4][4]
	andc	$C[3],$A[4][1],$A[4][0]
	xor	$A[4][0],$A[4][0],$C[0]
	andc	$C[0],$A[4][4],$A[4][3]
	xor	$A[4][1],$A[4][1],$C[1]
	xor	$A[4][3],$A[4][3],$C[2]
	xor	$A[4][4],$A[4][4],$C[3]
	xor	$A[4][2],$A[4][2],$C[0]

	bdnz	.Loop

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int

; KeccakF1600: apply the permutation to a state held in memory.
;   r3 = address of the 25 64-bit lanes of A[5][5]
; Allocates a frame, saves r14-r31 and LR, obtains the round-constant
; table address via PICmeup, loads the state into registers, runs
; KeccakF1600_int and writes the 25 lanes back to the same address.
.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)
	;$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)
	;$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)
	;$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

	ld	$A[0][0],`8*0`(r3)		; load A[5][5]
	ld	$A[0][1],`8*1`(r3)
	ld	$A[0][2],`8*2`(r3)
	ld	$A[0][3],`8*3`(r3)
	ld	$A[0][4],`8*4`(r3)
	ld	$A[1][0],`8*5`(r3)
	ld	$A[1][1],`8*6`(r3)
	ld	$A[1][2],`8*7`(r3)
	ld	$A[1][3],`8*8`(r3)
	ld	$A[1][4],`8*9`(r3)
	ld	$A[2][0],`8*10`(r3)
	ld	$A[2][1],`8*11`(r3)
	ld	$A[2][2],`8*12`(r3)
	ld	$A[2][3],`8*13`(r3)
	ld	$A[2][4],`8*14`(r3)
	ld	$A[3][0],`8*15`(r3)
	ld	$A[3][1],`8*16`(r3)
	ld	$A[3][2],`8*17`(r3)
	ld	$A[3][3],`8*18`(r3)
	ld	$A[3][4],`8*19`(r3)
	ld	$A[4][0],`8*20`(r3)
	ld	$A[4][1],`8*21`(r3)
	ld	$A[4][2],`8*22`(r3)
	ld	$A[4][3],`8*23`(r3)
	ld	$A[4][4],`8*24`(r3)

	bl	KeccakF1600_int

	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
	std	$A[0][0],`8*0`(r3)		; return A[5][5]
	std	$A[0][1],`8*1`(r3)
	std	$A[0][2],`8*2`(r3)
	std	$A[0][3],`8*3`(r3)
	std	$A[0][4],`8*4`(r3)
	std	$A[1][0],`8*5`(r3)
	std	$A[1][1],`8*6`(r3)
	std	$A[1][2],`8*7`(r3)
	std	$A[1][3],`8*8`(r3)
	std	$A[1][4],`8*9`(r3)
	std	$A[2][0],`8*10`(r3)
	std	$A[2][1],`8*11`(r3)
	std	$A[2][2],`8*12`(r3)
	std	$A[2][3],`8*13`(r3)
	std	$A[2][4],`8*14`(r3)
	std	$A[3][0],`8*15`(r3)
	std	$A[3][1],`8*16`(r3)
	std	$A[3][2],`8*17`(r3)
	std	$A[3][3],`8*18`(r3)
	std	$A[3][4],`8*19`(r3)
	std	$A[4][0],`8*20`(r3)
	std	$A[4][1],`8*21`(r3)
	std	$A[4][2],`8*22`(r3)
	std	$A[4][3],`8*23`(r3)
	std	$A[4][4],`8*24`(r3)

	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600

; dword_le_load: assemble one 64-bit little-endian word byte by byte.
;   in:  r3 = address of the byte *before* the first one to read
;   out: r0 = the eight bytes combined, first byte least significant
;        r3 advanced by 8 (each lbzu pre-increments it)
;   clobbers r4 and r5
.type	dword_le_load,\@function
.align	5
dword_le_load:
	lbzu	r0,1(r3)
	lbzu	r4,1(r3)
	lbzu	r5,1(r3)
	insrdi	r0,r4,8,48
	lbzu	r4,1(r3)
	insrdi	r0,r5,8,40
	lbzu	r5,1(r3)
	insrdi	r0,r4,8,32
	lbzu	r4,1(r3)
	insrdi	r0,r5,8,24
	lbzu	r5,1(r3)
	insrdi	r0,r4,8,16
	lbzu	r4,1(r3)
	insrdi	r0,r5,8,8
	insrdi	r0,r4,8,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	dword_le_load,.-dword_le_load

; SHA3_absorb: xor whole input blocks into the state and permute.
;   r3 = A[5][5], r4 = inp, r5 = len, r6 = bsz (block size in bytes)
; While len >= bsz, bsz/8 little-endian words are xored into consecutive
; lanes and KeccakF1600_int is run. The number of unprocessed bytes is
; returned in r3. The round-constant pointer kept in the frame is rewound
; after every block because KeccakF1600_int advances it by 24 entries.
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r4,r4,1				; prepare for lbzu
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)	; save A[][]
	$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)	; save inp
	$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)	; save len
	$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)	; save bsz
	mr	r0,r6
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

	ld	$A[0][0],`8*0`(r3)		; load A[5][5]
	ld	$A[0][1],`8*1`(r3)
	ld	$A[0][2],`8*2`(r3)
	ld	$A[0][3],`8*3`(r3)
	ld	$A[0][4],`8*4`(r3)
	ld	$A[1][0],`8*5`(r3)
	ld	$A[1][1],`8*6`(r3)
	ld	$A[1][2],`8*7`(r3)
	ld	$A[1][3],`8*8`(r3)
	ld	$A[1][4],`8*9`(r3)
	ld	$A[2][0],`8*10`(r3)
	ld	$A[2][1],`8*11`(r3)
	ld	$A[2][2],`8*12`(r3)
	ld	$A[2][3],`8*13`(r3)
	ld	$A[2][4],`8*14`(r3)
	ld	$A[3][0],`8*15`(r3)
	ld	$A[3][1],`8*16`(r3)
	ld	$A[3][2],`8*17`(r3)
	ld	$A[3][3],`8*18`(r3)
	ld	$A[3][4],`8*19`(r3)
	ld	$A[4][0],`8*20`(r3)
	ld	$A[4][1],`8*21`(r3)
	ld	$A[4][2],`8*22`(r3)
	ld	$A[4][3],`8*23`(r3)
	ld	$A[4][4],`8*24`(r3)

	mr	r3,r4
	mr	r4,r5
	mr	r5,r0

	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	r4,r5				; len < bsz?
	blt	.Labsorbed

	sub	r4,r4,r5			; len -= bsz
	srwi	r5,r5,3
	$PUSH	r4,`$LOCALS+2*$SIZE_T`($sp)	; save len
	mtctr	r5
	bl	dword_le_load			; *inp++
	xor	$A[0][0],$A[0][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[0][1],$A[0][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[0][2],$A[0][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[0][3],$A[0][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[0][4],$A[0][4],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][0],$A[1][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][1],$A[1][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][2],$A[1][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][3],$A[1][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][4],$A[1][4],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][0],$A[2][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][1],$A[2][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][2],$A[2][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][3],$A[2][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][4],$A[2][4],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][0],$A[3][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][1],$A[3][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][2],$A[3][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][3],$A[3][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][4],$A[3][4],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][0],$A[4][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][1],$A[4][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][2],$A[4][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][3],$A[4][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][4],$A[4][4],r0

.Lprocess_block:
	$PUSH	r3,`$LOCALS+1*$SIZE_T`($sp)	; save inp

	bl	KeccakF1600_int

	$POP	r0,`$LOCALS+4*$SIZE_T`($sp)	; pull iotas[24]
	$POP	r5,`$LOCALS+3*$SIZE_T`($sp)	; restore bsz
	$POP	r4,`$LOCALS+2*$SIZE_T`($sp)	; restore len
	$POP	r3,`$LOCALS+1*$SIZE_T`($sp)	; restore inp
	addic	r0,r0,`-8*24`			; rewind iotas
	$PUSH	r0,`$LOCALS+4*$SIZE_T`($sp)

	b	.Loop_absorb

.align	4
.Labsorbed:
	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
	std	$A[0][0],`8*0`(r3)		; return A[5][5]
	std	$A[0][1],`8*1`(r3)
	std	$A[0][2],`8*2`(r3)
	std	$A[0][3],`8*3`(r3)
	std	$A[0][4],`8*4`(r3)
	std	$A[1][0],`8*5`(r3)
	std	$A[1][1],`8*6`(r3)
	std	$A[1][2],`8*7`(r3)
	std	$A[1][3],`8*8`(r3)
	std	$A[1][4],`8*9`(r3)
	std	$A[2][0],`8*10`(r3)
	std	$A[2][1],`8*11`(r3)
	std	$A[2][2],`8*12`(r3)
	std	$A[2][3],`8*13`(r3)
	std	$A[2][4],`8*14`(r3)
	std	$A[3][0],`8*15`(r3)
	std	$A[3][1],`8*16`(r3)
	std	$A[3][2],`8*17`(r3)
	std	$A[3][3],`8*18`(r3)
	std	$A[3][4],`8*19`(r3)
	std	$A[4][0],`8*20`(r3)
	std	$A[4][1],`8*21`(r3)
	std	$A[4][2],`8*22`(r3)
	std	$A[4][3],`8*23`(r3)
	std	$A[4][4],`8*24`(r3)

	mr	r3,r4				; return value
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze: copy len bytes of output from the state to out, running
# KeccakF1600 each time a full block of bsz bytes has been emitted.
#   r3 = A[5][5], r4 = out, r5 = len, r6 = bsz
# The arguments are kept in r28-r31 (saved and restored here) so that they
# survive the call to KeccakF1600; r3 and r6 serve as the running lane
# pointer and the count of bytes left in the current block.
# Lanes are written least-significant byte first.
my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	$STU	$sp,`-10*$SIZE_T`($sp)
	mflr	r0
	$PUSH	r28,`6*$SIZE_T`($sp)
	$PUSH	r29,`7*$SIZE_T`($sp)
	$PUSH	r30,`8*$SIZE_T`($sp)
	$PUSH	r31,`9*$SIZE_T`($sp)
	$PUSH	r0,`10*$SIZE_T+$LRSAVE`($sp)

	mr	$A_flat,r3
	subi	r3,r3,8			; prepare for ldu
	subi	$out,r4,1		; prepare for stbu
	mr	$len,r5
	mr	$bsz,r6
	${UCMP}i $len,0			; nothing requested? a zero count
	beq	.Lsqueeze_done		; would make the tail loop run away
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldu	r0,8(r3)
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stbu	r0,1($out)
	srdi	r0,r0,8
	stbu	r0,1($out)
	srdi	r0,r0,8
	stbu	r0,1($out)
	srdi	r0,r0,8
	stbu	r0,1($out)
	srdi	r0,r0,8
	stbu	r0,1($out)
	srdi	r0,r0,8
	stbu	r0,1($out)
	srdi	r0,r0,8
	stbu	r0,1($out)
	srdi	r0,r0,8
	stbu	r0,1($out)

	subic.	$len,$len,8
	beq	.Lsqueeze_done

	subic.	r6,r6,8
	bgt	.Loop_squeeze

	mr	r3,$A_flat
	bl	KeccakF1600
	subi	r3,$A_flat,8		; prepare for ldu
	mr	r6,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len
.Loop_tail:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail

.Lsqueeze_done:
	$POP	r0,`10*$SIZE_T+$LRSAVE`($sp)
	$POP	r28,`6*$SIZE_T`($sp)
	$POP	r29,`7*$SIZE_T`($sp)
	$POP	r30,`8*$SIZE_T`($sp)
	$POP	r31,`9*$SIZE_T`($sp)
	mtlr	r0
	addi	$sp,$sp,`10*$SIZE_T`
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# PICmeup returns the address of the iotas table in r12 without relying on
# relocations: it branches-and-links to the very next instruction to learn
# its own address and adds the fixed distance to the table. The routine plus
# its trailer occupy 9 words, and the .space directive pads that to 64 bytes,
# so the table starts exactly 64 bytes after the PICmeup label. LR is
# preserved through r0.
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `expr` in the generated text with the value of the Perl
# expression (used above for frame offsets), then hand the result to the
# translator pipe. Closing STDOUT is checked so that a failure in the
# translator is not silently ignored.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
