#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PPC64.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT implementation that works on
# *any* PPC64. Then PowerISA 2.07 adds 2x64-bit vector rotate, and
# it's possible to achieve performance better than below, but that is
# naturally an option only for POWER8 and successors...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# PPC970/G5	14.0/+130%
# POWER7	9.7/+110%
# POWER8	10.6/+100%
# POWER9	8.2/+66%
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
#	much better (but watch out for them generating code specific
#	to processor they execute on).

# Command line is "[flavour] [output]".
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Only flavours containing "64" are accepted.  The variables set here are
# interpolated into the assembly text below: operand size, the offset of
# the link-register save slot relative to the caller's frame, and the
# mnemonics used for unsigned compare, store-with-update, load and store.
# A missing flavour is reported explicitly instead of matching on undef.
if (defined($flavour) && $flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} else {
	die "nonsense ".(defined($flavour) ? $flavour : "(no flavour given)");
}

# Locate ppc-xlate.pl, first next to this script and then in
# ../../perlasm/ relative to it.  The directory part of $0 is captured in
# list context so that a failed match (script invoked without any path
# component) gives an empty prefix instead of relying on an unchecked $1.
($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = "" unless defined $dir;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything printed to STDOUT is piped through the
# translator.  The interpreter and translator paths are quoted so that
# directories containing spaces survive the shell.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

# Stack frame layout used by KeccakF1600 and SHA3_absorb (byte offsets
# from the stack pointer after the frame has been allocated):
#   $LOCALS ...			slots for saved arguments and the iotas pointer
#   $TEMP ...			scratch where the round function spills A[0..3][4]
#   $FRAME-18*$SIZE_T ...	save area for r14-r31
$FRAME=24*$SIZE_T+6*$SIZE_T+32;
$LOCALS=6*$SIZE_T;
$TEMP=$LOCALS+6*$SIZE_T;

my $sp ="r1";

# The 5x5 state lives entirely in registers: row i occupies five
# consecutive registers starting at r7, r12, r17, r22 and r27.
my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ],
            (7, 12, 17, 22, 27));
   $A[1][1] = "r6"; # r13 is reserved

# Scratch registers; entries 4..7 are added temporarily inside the round
# function by aliasing state registers that have been spilled.
my @C = map("r$_", (0,3,4,5));

# Rotation amount for lane A[i][j] is $rhotates[i][j].
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# KeccakF1600_int: the 24-round permutation operating on the state held in
# the @A registers.  It allocates no frame of its own: it spills to the
# $TEMP area and reads/updates the round-constant pointer stored at
# $LOCALS+4*$SIZE_T, both relative to the caller's stack pointer.  It
# clobbers the @C scratch registers and the count register.
$code.=<<___;
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	b	.Loop
.align	4
.Loop:
	xor	$C[0],$A[0][0],$A[1][0]		; Theta
	std	$A[0][4],`$TEMP+0`($sp)
	xor	$C[1],$A[0][1],$A[1][1]
	std	$A[1][4],`$TEMP+8`($sp)
	xor	$C[2],$A[0][2],$A[1][2]
	std	$A[2][4],`$TEMP+16`($sp)
	xor	$C[3],$A[0][3],$A[1][3]
	std	$A[3][4],`$TEMP+24`($sp)
___
	# A[0..3][4] have just been spilled to $TEMP, so their registers
	# double as four extra scratch registers for the rest of Theta.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
	$C[6]=$A[2][4];
	$C[7]=$A[3][4];
$code.=<<___;
	xor	$C[4],$A[0][4],$A[1][4]
	xor	$C[0],$C[0],$A[2][0]
	xor	$C[1],$C[1],$A[2][1]
	xor	$C[2],$C[2],$A[2][2]
	xor	$C[3],$C[3],$A[2][3]
	xor	$C[4],$C[4],$A[2][4]
	xor	$C[0],$C[0],$A[3][0]
	xor	$C[1],$C[1],$A[3][1]
	xor	$C[2],$C[2],$A[3][2]
	xor	$C[3],$C[3],$A[3][3]
	xor	$C[4],$C[4],$A[3][4]
	xor	$C[0],$C[0],$A[4][0]
	xor	$C[2],$C[2],$A[4][2]
	xor	$C[1],$C[1],$A[4][1]
	xor	$C[3],$C[3],$A[4][3]
	rotldi	$C[5],$C[2],1
	xor	$C[4],$C[4],$A[4][4]
	rotldi	$C[6],$C[3],1
	xor	$C[5],$C[5],$C[0]
	rotldi	$C[7],$C[4],1

	xor	$A[0][1],$A[0][1],$C[5]
	xor	$A[1][1],$A[1][1],$C[5]
	xor	$A[2][1],$A[2][1],$C[5]
	xor	$A[3][1],$A[3][1],$C[5]
	xor	$A[4][1],$A[4][1],$C[5]

	rotldi	$C[5],$C[0],1
	xor	$C[6],$C[6],$C[1]
	xor	$C[2],$C[2],$C[7]
	rotldi	$C[7],$C[1],1
	xor	$C[3],$C[3],$C[5]
	xor	$C[4],$C[4],$C[7]

	xor	$C[1],   $A[0][2],$C[6]			;mr	$C[1],$A[0][2]
	xor	$A[1][2],$A[1][2],$C[6]
	xor	$A[2][2],$A[2][2],$C[6]
	xor	$A[3][2],$A[3][2],$C[6]
	xor	$A[4][2],$A[4][2],$C[6]

	xor	$A[0][0],$A[0][0],$C[4]
	xor	$A[1][0],$A[1][0],$C[4]
	xor	$A[2][0],$A[2][0],$C[4]
	xor	$A[3][0],$A[3][0],$C[4]
	xor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers are about to be reloaded with A[0..3][4],
	# so drop the aliases to catch any accidental later use.
	$C[4]=undef;
	$C[5]=undef;
	$C[6]=undef;
	$C[7]=undef;
$code.=<<___;
	ld	$A[0][4],`$TEMP+0`($sp)
	xor	$C[0],   $A[0][3],$C[2]			;mr	$C[0],$A[0][3]
	ld	$A[1][4],`$TEMP+8`($sp)
	xor	$A[1][3],$A[1][3],$C[2]
	ld	$A[2][4],`$TEMP+16`($sp)
	xor	$A[2][3],$A[2][3],$C[2]
	ld	$A[3][4],`$TEMP+24`($sp)
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[4][3],$A[4][3],$C[2]

	xor	$C[2],   $A[0][4],$C[3]			;mr	$C[2],$A[0][4]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[4][4],$A[4][4],$C[3]

	mr	$C[3],$A[0][1]				; Rho+Pi
	rotldi	$A[0][1],$A[1][1],$rhotates[1][1]
	;mr	$C[1],$A[0][2]
	rotldi	$A[0][2],$A[2][2],$rhotates[2][2]
	;mr	$C[0],$A[0][3]
	rotldi	$A[0][3],$A[3][3],$rhotates[3][3]
	;mr	$C[2],$A[0][4]
	rotldi	$A[0][4],$A[4][4],$rhotates[4][4]

	rotldi	$A[1][1],$A[1][4],$rhotates[1][4]
	rotldi	$A[2][2],$A[2][3],$rhotates[2][3]
	rotldi	$A[3][3],$A[3][2],$rhotates[3][2]
	rotldi	$A[4][4],$A[4][1],$rhotates[4][1]

	rotldi	$A[1][4],$A[4][2],$rhotates[4][2]
	rotldi	$A[2][3],$A[3][4],$rhotates[3][4]
	rotldi	$A[3][2],$A[2][1],$rhotates[2][1]
	rotldi	$A[4][1],$A[1][3],$rhotates[1][3]

	rotldi	$A[4][2],$A[2][4],$rhotates[2][4]
	rotldi	$A[3][4],$A[4][3],$rhotates[4][3]
	rotldi	$A[2][1],$A[1][2],$rhotates[1][2]
	rotldi	$A[1][3],$A[3][1],$rhotates[3][1]

	rotldi	$A[2][4],$A[4][0],$rhotates[4][0]
	rotldi	$A[4][3],$A[3][0],$rhotates[3][0]
	rotldi	$A[1][2],$A[2][0],$rhotates[2][0]
	rotldi	$A[3][1],$A[1][0],$rhotates[1][0]

	rotldi	$A[1][0],$C[0],$rhotates[0][3]
	rotldi	$A[2][0],$C[3],$rhotates[0][1]
	rotldi	$A[3][0],$C[2],$rhotates[0][4]
	rotldi	$A[4][0],$C[1],$rhotates[0][2]

	andc	$C[0],$A[0][2],$A[0][1]			; Chi+Iota
	andc	$C[1],$A[0][3],$A[0][2]
	andc	$C[2],$A[0][0],$A[0][4]
	andc	$C[3],$A[0][1],$A[0][0]
	xor	$A[0][0],$A[0][0],$C[0]
	andc	$C[0],$A[0][4],$A[0][3]
	xor	$A[0][1],$A[0][1],$C[1]
	 ld	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	xor	$A[0][3],$A[0][3],$C[2]
	xor	$A[0][4],$A[0][4],$C[3]
	xor	$A[0][2],$A[0][2],$C[0]
	 ldu	$C[3],8($C[1])				; Iota[i++]

	andc	$C[0],$A[1][2],$A[1][1]
	 std	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	andc	$C[1],$A[1][3],$A[1][2]
	andc	$C[2],$A[1][0],$A[1][4]
	 xor	$A[0][0],$A[0][0],$C[3]			; A[0][0] ^= Iota
	andc	$C[3],$A[1][1],$A[1][0]
	xor	$A[1][0],$A[1][0],$C[0]
	andc	$C[0],$A[1][4],$A[1][3]
	xor	$A[1][1],$A[1][1],$C[1]
	xor	$A[1][3],$A[1][3],$C[2]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[1][2],$A[1][2],$C[0]

	andc	$C[0],$A[2][2],$A[2][1]
	andc	$C[1],$A[2][3],$A[2][2]
	andc	$C[2],$A[2][0],$A[2][4]
	andc	$C[3],$A[2][1],$A[2][0]
	xor	$A[2][0],$A[2][0],$C[0]
	andc	$C[0],$A[2][4],$A[2][3]
	xor	$A[2][1],$A[2][1],$C[1]
	xor	$A[2][3],$A[2][3],$C[2]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[2][2],$A[2][2],$C[0]

	andc	$C[0],$A[3][2],$A[3][1]
	andc	$C[1],$A[3][3],$A[3][2]
	andc	$C[2],$A[3][0],$A[3][4]
	andc	$C[3],$A[3][1],$A[3][0]
	xor	$A[3][0],$A[3][0],$C[0]
	andc	$C[0],$A[3][4],$A[3][3]
	xor	$A[3][1],$A[3][1],$C[1]
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[3][2],$A[3][2],$C[0]

	andc	$C[0],$A[4][2],$A[4][1]
	andc	$C[1],$A[4][3],$A[4][2]
	andc	$C[2],$A[4][0],$A[4][4]
	andc	$C[3],$A[4][1],$A[4][0]
	xor	$A[4][0],$A[4][0],$C[0]
	andc	$C[0],$A[4][4],$A[4][3]
	xor	$A[4][1],$A[4][1],$C[1]
	xor	$A[4][3],$A[4][3],$C[2]
	xor	$A[4][4],$A[4][4],$C[3]
	xor	$A[4][2],$A[4][2],$C[0]

	bdnz	.Loop

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int

; KeccakF1600: r3 points at the 25-lane state.  Allocates the frame,
; saves r14-r31 and LR, stores the iotas pointer (biased by -8 for ldu)
; in the locals area, loads the state into registers, calls
; KeccakF1600_int and writes the state back through the saved pointer.
.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)
	;$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)
	;$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)
	;$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

	ld	$A[0][0],`8*0`(r3)		; load A[5][5]
	ld	$A[0][1],`8*1`(r3)
	ld	$A[0][2],`8*2`(r3)
	ld	$A[0][3],`8*3`(r3)
	ld	$A[0][4],`8*4`(r3)
	ld	$A[1][0],`8*5`(r3)
	ld	$A[1][1],`8*6`(r3)
	ld	$A[1][2],`8*7`(r3)
	ld	$A[1][3],`8*8`(r3)
	ld	$A[1][4],`8*9`(r3)
	ld	$A[2][0],`8*10`(r3)
	ld	$A[2][1],`8*11`(r3)
	ld	$A[2][2],`8*12`(r3)
	ld	$A[2][3],`8*13`(r3)
	ld	$A[2][4],`8*14`(r3)
	ld	$A[3][0],`8*15`(r3)
	ld	$A[3][1],`8*16`(r3)
	ld	$A[3][2],`8*17`(r3)
	ld	$A[3][3],`8*18`(r3)
	ld	$A[3][4],`8*19`(r3)
	ld	$A[4][0],`8*20`(r3)
	ld	$A[4][1],`8*21`(r3)
	ld	$A[4][2],`8*22`(r3)
	ld	$A[4][3],`8*23`(r3)
	ld	$A[4][4],`8*24`(r3)

	bl	KeccakF1600_int

	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
	std	$A[0][0],`8*0`(r3)		; return A[5][5]
	std	$A[0][1],`8*1`(r3)
	std	$A[0][2],`8*2`(r3)
	std	$A[0][3],`8*3`(r3)
	std	$A[0][4],`8*4`(r3)
	std	$A[1][0],`8*5`(r3)
	std	$A[1][1],`8*6`(r3)
	std	$A[1][2],`8*7`(r3)
	std	$A[1][3],`8*8`(r3)
	std	$A[1][4],`8*9`(r3)
	std	$A[2][0],`8*10`(r3)
	std	$A[2][1],`8*11`(r3)
	std	$A[2][2],`8*12`(r3)
	std	$A[2][3],`8*13`(r3)
	std	$A[2][4],`8*14`(r3)
	std	$A[3][0],`8*15`(r3)
	std	$A[3][1],`8*16`(r3)
	std	$A[3][2],`8*17`(r3)
	std	$A[3][3],`8*18`(r3)
	std	$A[3][4],`8*19`(r3)
	std	$A[4][0],`8*20`(r3)
	std	$A[4][1],`8*21`(r3)
	std	$A[4][2],`8*22`(r3)
	std	$A[4][3],`8*23`(r3)
	std	$A[4][4],`8*24`(r3)

	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600

; dword_le_load: assembles the eight bytes at r3+1 .. r3+8 into r0 as a
; little-endian 64-bit value (byte at r3+1 ends up least significant)
; and advances r3 by 8 through the final lbzu.  Clobbers r4 and r5.
.type	dword_le_load,\@function
.align	5
dword_le_load:
	lbz	r0,1(r3)
	lbz	r4,2(r3)
	lbz	r5,3(r3)
	insrdi	r0,r4,8,48
	lbz	r4,4(r3)
	insrdi	r0,r5,8,40
	lbz	r5,5(r3)
	insrdi	r0,r4,8,32
	lbz	r4,6(r3)
	insrdi	r0,r5,8,24
	lbz	r5,7(r3)
	insrdi	r0,r4,8,16
	lbzu	r4,8(r3)
	insrdi	r0,r5,8,8
	insrdi	r0,r4,8,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	dword_le_load,.-dword_le_load

; SHA3_absorb: r3 = A[5][5], r4 = inp, r5 = len, r6 = bsz.
; While len is not below bsz: subtracts bsz from len, xors bsz/8
; little-endian input dwords into consecutive state lanes, runs
; KeccakF1600_int and rewinds the iotas pointer.  On exit the state is
; stored back and the remaining len is returned in r3.
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r4,r4,1				; prepare for lbzu
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)	; save A[][]
	$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)	; save inp
	$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)	; save len
	$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)	; save bsz
	mr	r0,r6
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

	ld	$A[0][0],`8*0`(r3)		; load A[5][5]
	ld	$A[0][1],`8*1`(r3)
	ld	$A[0][2],`8*2`(r3)
	ld	$A[0][3],`8*3`(r3)
	ld	$A[0][4],`8*4`(r3)
	ld	$A[1][0],`8*5`(r3)
	ld	$A[1][1],`8*6`(r3)
	ld	$A[1][2],`8*7`(r3)
	ld	$A[1][3],`8*8`(r3)
	ld	$A[1][4],`8*9`(r3)
	ld	$A[2][0],`8*10`(r3)
	ld	$A[2][1],`8*11`(r3)
	ld	$A[2][2],`8*12`(r3)
	ld	$A[2][3],`8*13`(r3)
	ld	$A[2][4],`8*14`(r3)
	ld	$A[3][0],`8*15`(r3)
	ld	$A[3][1],`8*16`(r3)
	ld	$A[3][2],`8*17`(r3)
	ld	$A[3][3],`8*18`(r3)
	ld	$A[3][4],`8*19`(r3)
	ld	$A[4][0],`8*20`(r3)
	ld	$A[4][1],`8*21`(r3)
	ld	$A[4][2],`8*22`(r3)
	ld	$A[4][3],`8*23`(r3)
	ld	$A[4][4],`8*24`(r3)

	mr	r3,r4
	mr	r4,r5
	mr	r5,r0

	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	r4,r5				; len < bsz?
	blt	.Labsorbed

	sub	r4,r4,r5			; len -= bsz
	srwi	r5,r5,3
	$PUSH	r4,`$LOCALS+2*$SIZE_T`($sp)	; save len
	mtctr	r5
	bl	dword_le_load			; *inp++
	xor	$A[0][0],$A[0][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[0][1],$A[0][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[0][2],$A[0][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[0][3],$A[0][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[0][4],$A[0][4],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][0],$A[1][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][1],$A[1][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][2],$A[1][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][3],$A[1][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[1][4],$A[1][4],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][0],$A[2][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][1],$A[2][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][2],$A[2][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][3],$A[2][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[2][4],$A[2][4],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][0],$A[3][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][1],$A[3][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][2],$A[3][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][3],$A[3][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[3][4],$A[3][4],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][0],$A[4][0],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][1],$A[4][1],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][2],$A[4][2],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][3],$A[4][3],r0
	bdz	.Lprocess_block
	bl	dword_le_load			; *inp++
	xor	$A[4][4],$A[4][4],r0

.Lprocess_block:
	$PUSH	r3,`$LOCALS+1*$SIZE_T`($sp)	; save inp

	bl	KeccakF1600_int

	$POP	r0,`$LOCALS+4*$SIZE_T`($sp)	; pull iotas[24]
	$POP	r5,`$LOCALS+3*$SIZE_T`($sp)	; restore bsz
	$POP	r4,`$LOCALS+2*$SIZE_T`($sp)	; restore len
	$POP	r3,`$LOCALS+1*$SIZE_T`($sp)	; restore inp
	addic	r0,r0,`-8*24`			; rewind iotas
	$PUSH	r0,`$LOCALS+4*$SIZE_T`($sp)

	b	.Loop_absorb

.align	4
.Labsorbed:
	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
	std	$A[0][0],`8*0`(r3)		; return A[5][5]
	std	$A[0][1],`8*1`(r3)
	std	$A[0][2],`8*2`(r3)
	std	$A[0][3],`8*3`(r3)
	std	$A[0][4],`8*4`(r3)
	std	$A[1][0],`8*5`(r3)
	std	$A[1][1],`8*6`(r3)
	std	$A[1][2],`8*7`(r3)
	std	$A[1][3],`8*8`(r3)
	std	$A[1][4],`8*9`(r3)
	std	$A[2][0],`8*10`(r3)
	std	$A[2][1],`8*11`(r3)
	std	$A[2][2],`8*12`(r3)
	std	$A[2][3],`8*13`(r3)
	std	$A[2][4],`8*14`(r3)
	std	$A[3][0],`8*15`(r3)
	std	$A[3][1],`8*16`(r3)
	std	$A[3][2],`8*17`(r3)
	std	$A[3][3],`8*18`(r3)
	std	$A[3][4],`8*19`(r3)
	std	$A[4][0],`8*20`(r3)
	std	$A[4][1],`8*21`(r3)
	std	$A[4][2],`8*22`(r3)
	std	$A[4][3],`8*23`(r3)
	std	$A[4][4],`8*24`(r3)

	mr	r3,r4				; return value
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
# SHA3_squeeze: arguments arrive in r3 (state), r4 (output pointer),
# r5 (output length) and r6 (block size).  The state pointer, output
# pointer, length and block size are kept in r28-r31, which are saved in
# the prologue, so they survive the call to KeccakF1600; r3 walks the
# state and r6 counts down the bytes left in the current block.  State
# lanes are written out least significant byte first.
{
my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	$STU	$sp,`-10*$SIZE_T`($sp)
	mflr	r0
	$PUSH	r28,`6*$SIZE_T`($sp)
	$PUSH	r29,`7*$SIZE_T`($sp)
	$PUSH	r30,`8*$SIZE_T`($sp)
	$PUSH	r31,`9*$SIZE_T`($sp)
	$PUSH	r0,`10*$SIZE_T+$LRSAVE`($sp)

	mr	$A_flat,r3
	subi	r3,r3,8			; prepare for ldu
	subi	$out,r4,1		; prepare for stbu
	mr	$len,r5
	mr	$bsz,r6
	${UCMP}i $len,0			; nothing requested?
	beq	.Lsqueeze_done		; avoid mtctr 0 + bdnz runaway in tail
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldu	r0,8(r3)
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stb	r0,1($out)
	srdi	r0,r0,8
	stb	r0,2($out)
	srdi	r0,r0,8
	stb	r0,3($out)
	srdi	r0,r0,8
	stb	r0,4($out)
	srdi	r0,r0,8
	stb	r0,5($out)
	srdi	r0,r0,8
	stb	r0,6($out)
	srdi	r0,r0,8
	stb	r0,7($out)
	srdi	r0,r0,8
	stbu	r0,8($out)

	subic.	$len,$len,8
	beq	.Lsqueeze_done

	subic.	r6,r6,8
	bgt	.Loop_squeeze

	mr	r3,$A_flat
	bl	KeccakF1600
	subi	r3,$A_flat,8		; prepare for ldu
	mr	r6,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len
.Loop_tail:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail

.Lsqueeze_done:
	$POP	r0,`10*$SIZE_T+$LRSAVE`($sp)
	$POP	r28,`6*$SIZE_T`($sp)
	$POP	r29,`7*$SIZE_T`($sp)
	$POP	r30,`8*$SIZE_T`($sp)
	$POP	r31,`9*$SIZE_T`($sp)
	mtlr	r0
	addi	$sp,$sp,`10*$SIZE_T`
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

# Ugly hack here, because PPC assembler syntax seem to vary too
# much from platforms to platform...
#
# PICmeup returns the address of the iotas table in r12 without any
# relocation: "bcl 20,31,$+4" puts the address of the following
# instruction (PICmeup+8) into LR, and 64-8 is added to it.  That only
# works because PICmeup is 64-byte aligned and padded to exactly 64
# bytes (9 words of code and traceback data plus the .space), so keep
# the .space expression in sync if instructions are added or removed.
# r0 is used to preserve the caller's LR across the bcl.
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `...` span in the generated text with the result of
# evaluating it as Perl (the heredocs have already interpolated the
# variables, so these are plain arithmetic expressions), then send the
# text to the translator.  Write errors on the pipe surface at close,
# hence the explicit check.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
