1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for PowerISA 2.07.
17#
18# June 2017.
19#
20# This is straightforward KECCAK_1X_ALT SIMD implementation, but with
21# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
22# POWER8 processor spends 9.8 cycles to process byte out of large
23# buffer for r=1088, which matches SHA3-256. This is 17% better than
24# scalar PPC64 code. It probably should be noted that if POWER8's
25# successor can achieve higher scalar instruction issue rate, then
26# this module will loose... And it does on POWER9 with 12.0 vs. 9.4.
27
# First command-line argument is the "flavour" (e.g. linux64, linux32,
# linux64le, aix64, ...); it selects the ABI-dependent parameters below
# that get interpolated into the assembly heredocs.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;		# bytes per pointer/GPR
	$LRSAVE	=2*$SIZE_T;	# link-register save offset in caller's frame
	$UCMP	="cmpld";	# unsigned register compare
	$STU	="stdu";	# store-with-update (stack frame allocation)
	$POP	="ld";		# load pointer-sized value
	$PUSH	="std";		# store pointer-sized value
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }
45
# Locate the ppc-xlate.pl "assembler translator" either next to this
# script or in the shared perlasm directory, then pipe our output
# through it.  The translator receives the flavour and the output file
# name (the next command-line argument).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Low-precedence "or" is essential here: with "||" the die would bind
# into open()'s argument expression ("string" || die), which is always
# true, so a failed open would go entirely undetected.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
52
# Stack frame: 6 pointer-sized ABI slots plus the offload area for the
# non-volatile vector registers v20-v31 (12*16 bytes plus one extra
# 16-byte slot of slack, since the stvx slots start at offset 15+6*SIZE_T
# and get rounded to 16-byte alignment).
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

my $sp ="r1";		# stack pointer

my $iotas = "r12";	# points at the iotas round-constant table
58
59########################################################################
60# Register layout:
61#
62# v0		A[0][0] A[1][0]
63# v1		A[0][1] A[1][1]
64# v2		A[0][2] A[1][2]
65# v3		A[0][3] A[1][3]
66# v4		A[0][4] A[1][4]
67#
68# v5		A[2][0] A[3][0]
69# v6		A[2][1] A[3][1]
70# v7		A[2][2] A[3][2]
71# v8		A[2][3] A[3][3]
72# v9		A[2][4] A[3][4]
73#
74# v10		A[4][0] A[4][1]
75# v11		A[4][2] A[4][3]
76# v12		A[4][4] A[4][4]
77#
78# v13..25	rhotates[][]
79# v26..31	volatile
80#
$code.=<<___;
.machine	"any"
.text

.type	KeccakF1600_int,\@function
.align	5
; Core permutation: applies all 24 rounds of Keccak-f[1600] to the state
; held in v0-v12 (register layout documented above).  Callers must have
; the rhotates[][] table preloaded in v13..v25 and r12 pointing at the
; iotas round constants.  Clobbers r0, ctr and v26..v31.  Internal
; subroutine, not an ABI-compliant entry point.
KeccakF1600_int:
	li	r0,24
	mtctr	r0			; 24 rounds
	li	r0,0			; r0 is the running byte offset into iotas[]
	b	.Loop

.align	4
.Loop:
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
	vxor	v26,v0, v5		; A[0..1][0]^A[2..3][0]
	vxor	v27,v1, v6		; A[0..1][1]^A[2..3][1]
	vxor	v28,v2, v7		; A[0..1][2]^A[2..3][2]
	vxor	v29,v3, v8		; A[0..1][3]^A[2..3][3]
	vxor	v30,v4, v9		; A[0..1][4]^A[2..3][4]
	vpermdi	v31,v26,v27,0b00	; A[0][0..1]^A[2][0..1]
	vpermdi	v26,v26,v27,0b11	; A[1][0..1]^A[3][0..1]
	vpermdi	v27,v28,v29,0b00	; A[0][2..3]^A[2][2..3]
	vpermdi	v28,v28,v29,0b11	; A[1][2..3]^A[3][2..3]
	vpermdi	v29,v30,v30,0b10	; A[1..0][4]^A[3..2][4]
	vxor	v26,v26,v31		; C[0..1]
	vxor	v27,v27,v28		; C[2..3]
	vxor	v28,v29,v30		; C[4..4]
	vspltisb v31,1			; rotate count of 1 for the D[] terms
	vxor	v26,v26,v10		; C[0..1] ^= A[4][0..1]
	vxor	v27,v27,v11		; C[2..3] ^= A[4][2..3]
	vxor	v28,v28,v12		; C[4..4] ^= A[4][4..4], low!

	vrld	v29,v26,v31		; ROL64(C[0..1],1)
	vrld	v30,v27,v31		; ROL64(C[2..3],1)
	vrld	v31,v28,v31		; ROL64(C[4..4],1)
	vpermdi	v31,v31,v29,0b10	; assemble ROL64(C[4..0],1)
	vxor	v26,v26,v30		; C[0..1] ^= ROL64(C[2..3],1)
	vxor	v27,v27,v31		; C[2..3] ^= ROL64(C[4..0],1)
	vxor	v28,v28,v29		; C[4..4] ^= ROL64(C[0..1],1), low!

	; fold the D[] terms into all 25 lanes
	vpermdi	v29,v26,v26,0b00	; C[0..0]
	vpermdi	v30,v28,v26,0b10	; C[4..0]
	vpermdi	v31,v28,v28,0b11	; C[4..4]
	vxor	v1, v1, v29		; A[0..1][1] ^= C[0..0]
	vxor	v6, v6, v29		; A[2..3][1] ^= C[0..0]
	vxor	v10,v10,v30		; A[4][0..1] ^= C[4..0]
	vxor	v0, v0, v31		; A[0..1][0] ^= C[4..4]
	vxor	v5, v5, v31		; A[2..3][0] ^= C[4..4]

	vpermdi	v29,v27,v27,0b00	; C[2..2]
	vpermdi	v30,v26,v26,0b11	; C[1..1]
	vpermdi	v31,v26,v27,0b10	; C[1..2]
	vxor	v3, v3, v29		; A[0..1][3] ^= C[2..2]
	vxor	v8, v8, v29		; A[2..3][3] ^= C[2..2]
	vxor	v2, v2, v30		; A[0..1][2] ^= C[1..1]
	vxor	v7, v7, v30		; A[2..3][2] ^= C[1..1]
	vxor	v11,v11,v31		; A[4][2..3] ^= C[1..2]

	vpermdi	v29,v27,v27,0b11	; C[3..3]
	vxor	v4, v4, v29		; A[0..1][4] ^= C[3..3]
	vxor	v9, v9, v29		; A[2..3][4] ^= C[3..3]
	vxor	v12,v12,v29		; A[4..4][4] ^= C[3..3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
	; per-lane rotates by the preloaded rhotates[][] counts; lanes that
	; Pi will relocate are rotated into scratch v26-v30
	vrld	v26,v0, v13		; v0
	vrld	v1, v1, v14
	vrld	v27,v2, v15		; v2
	vrld	v28,v3, v16		; v3
	vrld	v4, v4, v17
	vrld	v5, v5, v18
	vrld	v6, v6, v19
	vrld	v29,v7, v20		; v7
	vrld	v8, v8, v21
	vrld	v9, v9, v22
	vrld	v10,v10,v23
	vrld	v30,v11,v24		; v11
	vrld	v12,v12,v25

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
	; lane permutation, disjoint from Rho; "dst < src" lane pairs noted
	vpermdi	v0, v26,v28,0b00	; [0][0] [1][0] < [0][0] [0][3]
	vpermdi	v2, v29,v5, 0b00	; [0][2] [1][2] < [2][2] [2][0]
	vpermdi	v11,v9, v5, 0b01	; [4][2] [4][3] < [2][4] [3][0]
	vpermdi	v5, v1, v4, 0b00	; [2][0] [3][0] < [0][1] [0][4]
	vpermdi	v1, v1, v4, 0b11	; [0][1] [1][1] < [1][1] [1][4]
	vpermdi	v3, v8, v6, 0b11	; [0][3] [1][3] < [3][3] [3][1]
	vpermdi	v4, v12,v30,0b10	; [0][4] [1][4] < [4][4] [4][2]
	vpermdi	v7, v8, v6, 0b00	; [2][2] [3][2] < [2][3] [2][1]
	vpermdi	v6, v27,v26,0b11	; [2][1] [3][1] < [1][2] [1][0]
	vpermdi	v8, v9, v29,0b11	; [2][3] [3][3] < [3][4] [3][2]
	vpermdi	v12,v10,v10,0b11	; [4][4] [4][4] < [4][1] [4][1]
	vpermdi	v9, v10,v30,0b01	; [2][4] [3][4] < [4][0] [4][3]
	vpermdi	v10,v27,v28,0b01	; [4][0] [4][1] < [0][2] [1][3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
	lvx_u	v31,$iotas,r0		; iotas[index]
	addic	r0,r0,16		; index++

	vandc	v26,v2, v1		; (~A[0..1][1] & A[0..1][2])
	vandc	v27,v3, v2		; (~A[0..1][2] & A[0..1][3])
	vandc	v28,v4, v3		; (~A[0..1][3] & A[0..1][4])
	vandc	v29,v0, v4		; (~A[0..1][4] & A[0..1][0])
	vandc	v30,v1, v0		; (~A[0..1][0] & A[0..1][1])
	vxor	v0, v0, v26		; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
	vxor	v1, v1, v27		; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
	vxor	v2, v2, v28		; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vxor	v3, v3, v29		; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	vxor	v4, v4, v30		; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vandc	v26,v7, v6		; (~A[2..3][1] & A[2..3][2])
	vandc	v27,v8, v7		; (~A[2..3][2] & A[2..3][3])
	vandc	v28,v9, v8		; (~A[2..3][3] & A[2..3][4])
	vandc	v29,v5, v9		; (~A[2..3][4] & A[2..3][0])
	vandc	v30,v6, v5		; (~A[2..3][0] & A[2..3][1])
	vxor	v5, v5, v26		; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vxor	v6, v6, v27		; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vxor	v7, v7, v28		; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vxor	v8, v8, v29		; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vxor	v9, v9, v30		; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vxor	v0, v0, v31		; A[0][0] ^= iotas[index++]

	; Chi for the last row, built from permuted copies of A[4][*]
	vpermdi	v26,v10,v11,0b10	; A[4][1..2]
	vpermdi	v27,v12,v10,0b00	; A[4][4..0]
	vpermdi	v28,v11,v12,0b10	; A[4][3..4]
	vpermdi	v29,v10,v10,0b10	; A[4][1..0]
	vandc	v26,v11,v26		; (~A[4][1..2] & A[4][2..3])
	vandc	v27,v27,v28		; (~A[4][3..4] & A[4][4..0])
	vandc	v28,v10,v29		; (~A[4][1..0] & A[4][0..1])
	vxor	v10,v10,v26		; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
	vxor	v11,v11,v27		; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
	vxor	v12,v12,v28		; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])

	bdnz	.Loop

	vpermdi	v12,v12,v12,0b11	; broadcast A[4][4]
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int
221
.type	KeccakF1600,\@function
.align	5
; Self-contained, ABI-compliant Keccak-f[1600] on the A[5][5] state
; pointed at by r3: saves non-volatile v20-v31 and vrsave, loads the
; state and the rhotates table, runs KeccakF1600_int and writes the
; state back.  Used by SHA3_squeeze between output blocks.
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,r3			; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,r3
	addi	r11,r11,32
	lvx_4w	v2,r10,r3
	addi	r10,r10,32
	lvx_4w	v3,r11,r3
	addi	r11,r11,32
	lvx_4w	v4,r10,r3
	addi	r10,r10,32
	lvx_4w	v5,r11,r3
	addi	r11,r11,32
	lvx_4w	v6,r10,r3
	addi	r10,r10,32
	lvx_4w	v7,r11,r3
	addi	r11,r11,32
	lvx_4w	v8,r10,r3
	addi	r10,r10,32
	lvx_4w	v9,r11,r3
	addi	r11,r11,32
	lvx_4w	v10,r10,r3
	addi	r10,r10,32
	lvx_4w	v11,r11,r3
	lvx_splt v12,r10,r3

	bl	PICmeup			; r12 = address of data tables

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	addi	r12,r12,`16*16`		; points at iotas

	bl	KeccakF1600_int

	li	r11,16
	stvx_4w	v0,0,r3			; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,r3
	addi	r11,r11,32
	stvx_4w	v2,r10,r3
	addi	r10,r10,32
	stvx_4w	v3,r11,r3
	addi	r11,r11,32
	stvx_4w	v4,r10,r3
	addi	r10,r10,32
	stvx_4w	v5,r11,r3
	addi	r11,r11,32
	stvx_4w	v6,r10,r3
	addi	r10,r10,32
	stvx_4w	v7,r11,r3
	addi	r11,r11,32
	stvx_4w	v8,r10,r3
	addi	r10,r10,32
	stvx_4w	v9,r11,r3
	addi	r11,r11,32
	stvx_4w	v10,r10,r3
	addi	r10,r10,32
	stvx_4w	v11,r11,r3
	stvdx_u v12,r10,r3

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___
373{
374my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));
375
376$code.=<<___;
377.globl	SHA3_absorb
378.type	SHA3_absorb,\@function
379.align	5
380SHA3_absorb:
381	$STU	$sp,-$FRAME($sp)
382	li	r10,`15+6*$SIZE_T`
383	li	r11,`31+6*$SIZE_T`
384	mflr	r8
385	mfspr	r7, 256			; save vrsave
386	stvx	v20,r10,$sp
387	addi	r10,r10,32
388	stvx	v21,r11,$sp
389	addi	r11,r11,32
390	stvx	v22,r10,$sp
391	addi	r10,r10,32
392	stvx	v23,r11,$sp
393	addi	r11,r11,32
394	stvx	v24,r10,$sp
395	addi	r10,r10,32
396	stvx	v25,r11,$sp
397	addi	r11,r11,32
398	stvx	v26,r10,$sp
399	addi	r10,r10,32
400	stvx	v27,r11,$sp
401	addi	r11,r11,32
402	stvx	v28,r10,$sp
403	addi	r10,r10,32
404	stvx	v29,r11,$sp
405	addi	r11,r11,32
406	stvx	v30,r10,$sp
407	stvx	v31,r11,$sp
408	stw	r7,`$FRAME-4`($sp)	; save vrsave
409	li	r0, -1
410	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
411	mtspr	256, r0			; preserve all AltiVec registers
412
413	li	r11,16
414	lvx_4w	v0,0,$A_jagged		; load A[5][5]
415	li	r10,32
416	lvx_4w	v1,r11,$A_jagged
417	addi	r11,r11,32
418	lvx_4w	v2,r10,$A_jagged
419	addi	r10,r10,32
420	lvx_4w	v3,r11,$A_jagged
421	addi	r11,r11,32
422	lvx_4w	v4,r10,$A_jagged
423	addi	r10,r10,32
424	lvx_4w	v5,r11,$A_jagged
425	addi	r11,r11,32
426	lvx_4w	v6,r10,$A_jagged
427	addi	r10,r10,32
428	lvx_4w	v7,r11,$A_jagged
429	addi	r11,r11,32
430	lvx_4w	v8,r10,$A_jagged
431	addi	r10,r10,32
432	lvx_4w	v9,r11,$A_jagged
433	addi	r11,r11,32
434	lvx_4w	v10,r10,$A_jagged
435	addi	r10,r10,32
436	lvx_4w	v11,r11,$A_jagged
437	lvx_splt v12,r10,$A_jagged
438
439	bl	PICmeup
440
441	li	r11,16
442	lvx_u	v13,0,r12		; load rhotates
443	li	r10,32
444	lvx_u	v14,r11,r12
445	addi	r11,r11,32
446	lvx_u	v15,r10,r12
447	addi	r10,r10,32
448	lvx_u	v16,r11,r12
449	addi	r11,r11,32
450	lvx_u	v17,r10,r12
451	addi	r10,r10,32
452	lvx_u	v18,r11,r12
453	addi	r11,r11,32
454	lvx_u	v19,r10,r12
455	addi	r10,r10,32
456	lvx_u	v20,r11,r12
457	addi	r11,r11,32
458	lvx_u	v21,r10,r12
459	addi	r10,r10,32
460	lvx_u	v22,r11,r12
461	addi	r11,r11,32
462	lvx_u	v23,r10,r12
463	addi	r10,r10,32
464	lvx_u	v24,r11,r12
465	lvx_u	v25,r10,r12
466	li	r10,-32
467	li	r11,-16
468	addi	r12,r12,`16*16`		; points at iotas
469	b	.Loop_absorb
470
471.align	4
472.Loop_absorb:
473	$UCMP	$len,$bsz		; len < bsz?
474	blt	.Labsorbed
475
476	sub	$len,$len,$bsz		; len -= bsz
477	srwi	r0,$bsz,3
478	mtctr	r0
479
480	lvx_u	v30,r10,r12		; permutation masks
481	lvx_u	v31,r11,r12
482	?vspltisb v27,7			; prepare masks for byte swap
483	?vxor	v30,v30,v27		; on big-endian
484	?vxor	v31,v31,v27
485
486	vxor	v27,v27,v27		; zero
487	lvdx_u	v26,0,$inp
488	addi	$inp,$inp,8
489	vperm	v26,v26,v27,v30
490	vxor	v0, v0, v26
491	bdz	.Lprocess_block
492	lvdx_u	v26,0,$inp
493	addi	$inp,$inp,8
494	vperm	v26,v26,v27,v30
495	vxor	v1, v1, v26
496	bdz	.Lprocess_block
497	lvdx_u	v26,0,$inp
498	addi	$inp,$inp,8
499	vperm	v26,v26,v27,v30
500	vxor	v2, v2, v26
501	bdz	.Lprocess_block
502	lvdx_u	v26,0,$inp
503	addi	$inp,$inp,8
504	vperm	v26,v26,v27,v30
505	vxor	v3, v3, v26
506	bdz	.Lprocess_block
507	lvdx_u	v26,0,$inp
508	addi	$inp,$inp,8
509	vperm	v26,v26,v27,v30
510	vxor	v4, v4, v26
511	bdz	.Lprocess_block
512	lvdx_u	v26,0,$inp
513	addi	$inp,$inp,8
514	vperm	v26,v26,v27,v31
515	vxor	v0, v0, v26
516	bdz	.Lprocess_block
517	lvdx_u	v26,0,$inp
518	addi	$inp,$inp,8
519	vperm	v26,v26,v27,v31
520	vxor	v1, v1, v26
521	bdz	.Lprocess_block
522	lvdx_u	v26,0,$inp
523	addi	$inp,$inp,8
524	vperm	v26,v26,v27,v31
525	vxor	v2, v2, v26
526	bdz	.Lprocess_block
527	lvdx_u	v26,0,$inp
528	addi	$inp,$inp,8
529	vperm	v26,v26,v27,v31
530	vxor	v3, v3, v26
531	bdz	.Lprocess_block
532	lvdx_u	v26,0,$inp
533	addi	$inp,$inp,8
534	vperm	v26,v26,v27,v31
535	vxor	v4, v4, v26
536	bdz	.Lprocess_block
537	lvdx_u	v26,0,$inp
538	addi	$inp,$inp,8
539	vperm	v26,v26,v27,v30
540	vxor	v5, v5, v26
541	bdz	.Lprocess_block
542	lvdx_u	v26,0,$inp
543	addi	$inp,$inp,8
544	vperm	v26,v26,v27,v30
545	vxor	v6, v6, v26
546	bdz	.Lprocess_block
547	lvdx_u	v26,0,$inp
548	addi	$inp,$inp,8
549	vperm	v26,v26,v27,v30
550	vxor	v7, v7, v26
551	bdz	.Lprocess_block
552	lvdx_u	v26,0,$inp
553	addi	$inp,$inp,8
554	vperm	v26,v26,v27,v30
555	vxor	v8, v8, v26
556	bdz	.Lprocess_block
557	lvdx_u	v26,0,$inp
558	addi	$inp,$inp,8
559	vperm	v26,v26,v27,v30
560	vxor	v9, v9, v26
561	bdz	.Lprocess_block
562	lvdx_u	v26,0,$inp
563	addi	$inp,$inp,8
564	vperm	v26,v26,v27,v31
565	vxor	v5, v5, v26
566	bdz	.Lprocess_block
567	lvdx_u	v26,0,$inp
568	addi	$inp,$inp,8
569	vperm	v26,v26,v27,v31
570	vxor	v6, v6, v26
571	bdz	.Lprocess_block
572	lvdx_u	v26,0,$inp
573	addi	$inp,$inp,8
574	vperm	v26,v26,v27,v31
575	vxor	v7, v7, v26
576	bdz	.Lprocess_block
577	lvdx_u	v26,0,$inp
578	addi	$inp,$inp,8
579	vperm	v26,v26,v27,v31
580	vxor	v8, v8, v26
581	bdz	.Lprocess_block
582	lvdx_u	v26,0,$inp
583	addi	$inp,$inp,8
584	vperm	v26,v26,v27,v31
585	vxor	v9, v9, v26
586	bdz	.Lprocess_block
587	lvdx_u	v26,0,$inp
588	addi	$inp,$inp,8
589	vperm	v26,v26,v27,v30
590	vxor	v10, v10, v26
591	bdz	.Lprocess_block
592	lvdx_u	v26,0,$inp
593	addi	$inp,$inp,8
594	vperm	v26,v26,v27,v31
595	vxor	v10, v10, v26
596	bdz	.Lprocess_block
597	lvdx_u	v26,0,$inp
598	addi	$inp,$inp,8
599	vperm	v26,v26,v27,v30
600	vxor	v11, v11, v26
601	bdz	.Lprocess_block
602	lvdx_u	v26,0,$inp
603	addi	$inp,$inp,8
604	vperm	v26,v26,v27,v31
605	vxor	v11, v11, v26
606	bdz	.Lprocess_block
607	lvdx_u	v26,0,$inp
608	addi	$inp,$inp,8
609	vperm	v26,v26,v27,v31
610	vxor	v12, v12, v26
611
612.Lprocess_block:
613	bl	KeccakF1600_int
614
615	b	.Loop_absorb
616
617.align	4
618.Labsorbed:
619	li	r11,16
620	stvx_4w	v0,0,$A_jagged		; return A[5][5]
621	li	r10,32
622	stvx_4w	v1,r11,$A_jagged
623	addi	r11,r11,32
624	stvx_4w	v2,r10,$A_jagged
625	addi	r10,r10,32
626	stvx_4w	v3,r11,$A_jagged
627	addi	r11,r11,32
628	stvx_4w	v4,r10,$A_jagged
629	addi	r10,r10,32
630	stvx_4w	v5,r11,$A_jagged
631	addi	r11,r11,32
632	stvx_4w	v6,r10,$A_jagged
633	addi	r10,r10,32
634	stvx_4w	v7,r11,$A_jagged
635	addi	r11,r11,32
636	stvx_4w	v8,r10,$A_jagged
637	addi	r10,r10,32
638	stvx_4w	v9,r11,$A_jagged
639	addi	r11,r11,32
640	stvx_4w	v10,r10,$A_jagged
641	addi	r10,r10,32
642	stvx_4w	v11,r11,$A_jagged
643	stvdx_u v12,r10,$A_jagged
644
645	mr	r3,$len			; return value
646	li	r10,`15+6*$SIZE_T`
647	li	r11,`31+6*$SIZE_T`
648	mtlr	r8
649	mtspr	256, r7			; restore vrsave
650	lvx	v20,r10,$sp
651	addi	r10,r10,32
652	lvx	v21,r11,$sp
653	addi	r11,r11,32
654	lvx	v22,r10,$sp
655	addi	r10,r10,32
656	lvx	v23,r11,$sp
657	addi	r11,r11,32
658	lvx	v24,r10,$sp
659	addi	r10,r10,32
660	lvx	v25,r11,$sp
661	addi	r11,r11,32
662	lvx	v26,r10,$sp
663	addi	r10,r10,32
664	lvx	v27,r11,$sp
665	addi	r11,r11,32
666	lvx	v28,r10,$sp
667	addi	r10,r10,32
668	lvx	v29,r11,$sp
669	addi	r11,r11,32
670	lvx	v30,r10,$sp
671	lvx	v31,r11,$sp
672	addi	$sp,$sp,$FRAME
673	blr
674	.long	0
675	.byte	0,12,0x04,1,0x80,0,4,0
676	.long	0
677.size	SHA3_absorb,.-SHA3_absorb
678___
679}
680{
681my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));
682
683$code.=<<___;
684.globl	SHA3_squeeze
685.type	SHA3_squeeze,\@function
686.align	5
687SHA3_squeeze:
688	mflr	r9			; r9 is not touched by KeccakF1600
689	subi	$out,$out,1		; prepare for stbu
690	addi	r8,$A_jagged,4		; prepare volatiles
691	mr	r10,$bsz
692	li	r11,0
693	b	.Loop_squeeze
694.align	4
695.Loop_squeeze:
696	lwzx	r7,r11,r8		; lo
697	lwzx	r0,r11,$A_jagged	; hi
698	${UCMP}i $len,8
699	blt	.Lsqueeze_tail
700
701	stbu	r7,1($out)		; write lo
702	srwi	r7,r7,8
703	stbu	r7,1($out)
704	srwi	r7,r7,8
705	stbu	r7,1($out)
706	srwi	r7,r7,8
707	stbu	r7,1($out)
708	stbu	r0,1($out)		; write hi
709	srwi	r0,r0,8
710	stbu	r0,1($out)
711	srwi	r0,r0,8
712	stbu	r0,1($out)
713	srwi	r0,r0,8
714	stbu	r0,1($out)
715
716	subic.	$len,$len,8
717	beqlr				; return if done
718
719	subic.	r10,r10,8
720	ble	.Loutput_expand
721
722	addi	r11,r11,16		; calculate jagged index
723	cmplwi	r11,`16*5`
724	blt	.Loop_squeeze
725	subi	r11,r11,72
726	beq	.Loop_squeeze
727	addi	r11,r11,72
728	cmplwi	r11,`16*5+8`
729	subi	r11,r11,8
730	beq	.Loop_squeeze
731	addi	r11,r11,8
732	cmplwi	r11,`16*10`
733	subi	r11,r11,72
734	beq	.Loop_squeeze
735	addi	r11,r11,72
736	blt	.Loop_squeeze
737	subi	r11,r11,8
738	b	.Loop_squeeze
739
740.align	4
741.Loutput_expand:
742	bl	KeccakF1600
743	mtlr	r9
744
745	addi	r8,$A_jagged,4		; restore volatiles
746	mr	r10,$bsz
747	li	r11,0
748	b	.Loop_squeeze
749
750.align	4
751.Lsqueeze_tail:
752	mtctr	$len
753	subic.	$len,$len,4
754	ble	.Loop_tail_lo
755	li	r8,4
756	mtctr	r8
757.Loop_tail_lo:
758	stbu	r7,1($out)
759	srdi	r7,r7,8
760	bdnz	.Loop_tail_lo
761	ble	.Lsqueeze_done
762	mtctr	$len
763.Loop_tail_hi:
764	stbu	r0,1($out)
765	srdi	r0,r0,8
766	bdnz	.Loop_tail_hi
767
768.Lsqueeze_done:
769	blr
770	.long	0
771	.byte	0,12,0x14,0,0,0,4,0
772	.long	0
773.size	SHA3_squeeze,.-SHA3_squeeze
774___
775}
$code.=<<___;
.align	6
; Position-independent locator for the data tables below: bcl/mflr yields
; the address of the instruction following bcl, and the addi advances r12
; past the rest of this (64-byte aligned, 64-byte long) stub to the first
; data entry, rhotates.  Clobbers r0 and r12 only.
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	rhotates,\@object
.align	6
; Rho rotate counts, one 16-byte entry per state register v0..v12, with
; the two quadwords matching the two 64-bit lanes of each register.
rhotates:
	.quad	0,  36
	.quad	1,  44
	.quad	62,  6
	.quad	28, 55
	.quad	27, 20
	.quad	3,  41
	.quad	10, 45
	.quad	43, 15
	.quad	25, 21
	.quad	39,  8
	.quad	18,  2
	.quad	61, 56
	.quad	14, 14
.size	rhotates,.-rhotates
	.quad	0,0
	; vperm masks used by SHA3_absorb (addressed at iotas-32/iotas-16):
	; route the 8 loaded input bytes into one 64-bit half of a vector
	; register, taking zeros from the second operand for the other half
	.quad	0x0001020304050607,0x1011121314151617
	.quad	0x1011121314151617,0x0001020304050607
.type	iotas,\@object
; Round constants, one 16-byte entry per round; the second quadword is
; zero so that the full-register XOR in Chi+Iota leaves A[1][0] intact.
iotas:
	.quad	0x0000000000000001,0
	.quad	0x0000000000008082,0
	.quad	0x800000000000808a,0
	.quad	0x8000000080008000,0
	.quad	0x000000000000808b,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008009,0
	.quad	0x000000000000008a,0
	.quad	0x0000000000000088,0
	.quad	0x0000000080008009,0
	.quad	0x000000008000000a,0
	.quad	0x000000008000808b,0
	.quad	0x800000000000008b,0
	.quad	0x8000000000008089,0
	.quad	0x8000000000008003,0
	.quad	0x8000000000008002,0
	.quad	0x8000000000000080,0
	.quad	0x000000000000800a,0
	.quad	0x800000008000000a,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008080,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008008,0
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
___
837
# Post-process the accumulated assembly and emit it line by line:
# backtick-quoted constructs are evaluated as Perl expressions, and the
# '?'-prefixed endian-dependent instructions are either kept (big-endian)
# or commented out with ';' (little-endian).
my $is_le = ($flavour =~ /le$/);
foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	if ($is_le) {			# little-endian
	    $line =~ s/\?([a-z]+)/;$1/;
	} else {			# big-endian
	    $line =~ s/\?([a-z]+)/$1/;
	}

	print $line,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
851