1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# SHA256/512 for PowerISA v2.07.
18#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
21# Relative comparison is therefore more informative. This module is
22# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
23# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
24# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is the degree of computational resource utilization. POWER8
# is a "massively multi-threaded chip", and the difference between
# single- and maximum multi-process benchmark results tells that
# utilization is a whopping 94%. For sha512-ppc.pl we get [not
# unimpressive] 84% and for sha1-ppc.pl - 73%. 100% means that the
# multi-process result equals the single-process one, given that all
# threads end up on the same physical core.
33#
34######################################################################
35# Believed-to-be-accurate results in cycles per processed byte [on
36# little-endian system]. Numbers in square brackets are for 64-bit
37# build of sha512-ppc.pl, presented for reference.
38#
39#		POWER8		POWER9
40# SHA256	9.7 [15.8]	11.2 [12.5]
41# SHA512	6.1 [10.3]	7.0 [7.9]
42
# Command line: <flavour> [<output>].  The flavour string selects the
# pointer size (it must contain "64" or "32") and the byte order ("le"
# anywhere in it means little-endian); both arguments are handed on to
# ppc-xlate.pl unchanged.
$flavour=shift;
$output =shift;

# Pointer size, the offset used for the link-register save slot, and
# the load/store mnemonics that match the pointer size.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

$LENDIAN=($flavour=~/le/);

# Look for ppc-xlate.pl next to this script first, then in
# ../../perlasm/ relative to it.  When $0 carries no directory part the
# prefix is empty, i.e. the lookup is relative to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
# Low-precedence "or" is essential: with "||" the die would bind to the
# (always true) command string and a failed open would go unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
68
# Select the hash variant from the output file name: "512" anywhere in
# it means SHA-512, anything else means SHA-256.
#   $bits   - variant number, used in the function name and ident string
#   $SZ     - size of one working word in bytes
#   $sz     - element-size letter spliced into the vector mnemonics
#   $rounds - number of rounds per input block
($bits,$SZ,$sz,$rounds) = ($output =~ /512/) ? (512,8,"d",80)
					     : (256,4,"w",64);

$func   = "sha${bits}_block_p8";
$LOCALS = 8*$SIZE_T+8*16;		# offset of the v24-v31 save area
$FRAME  = $LOCALS+9*16+6*$SIZE_T;	# total stack frame size

# General-purpose registers by role.
($sp,$toc,$ctx,$inp,$num,$Tbl,$idx,$lrsave) = map("r$_",(1..8));
($offload,$vrsave) = ("r11","r12");

# Index operands for lvx/stvx: $x00 is the literal 0, the others are
# registers that the prologue loads with 0x10..0x70.
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0,map("r$_",(10,26..31)));
@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70);

# Vector registers: working variables, 16 message words, temporaries.
($A,$B,$C,$D,$E,$F,$G,$H) = map("v$_",(0..7));
@V = ($A,$B,$C,$D,$E,$F,$G,$H);
@X = map("v$_",(8..19,24..27));
($Ki,$Func,$Sigma,$lemask) = map("v$_",(28..31));
100
# ROUND($i,$a,$b,$c,$d,$e,$f,$g,$h) - append the code for one round
# to the global $code.
#
#   $i      round number; rounds below 16 also bring the message words
#           into @X, rounds 15 and up also refresh one @X slot for a
#           later round.
#   $a..$h  names of the vector registers that currently hold the eight
#           working variables.
#
# The sub has no meaningful return value.
sub ROUND {
    my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
    my $j=($i+1)%16;		# @X slot refreshed by the schedule update
    my $k=($i+2)%8;		# @I slot used to fetch the next constant

    if ($i<16) {
	my $per_vec=16/$SZ;	# words per 16-byte input vector
	my $lane=$i%$per_vec;	# position of word $i inside its vector

	# On the last word of a vector fetch the next 16 input bytes,
	# unless this is round 15 and the block is already consumed.
	if ($i<15 && $lane==$per_vec-1) {
	    $code.=<<___;
	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
	addi		$inp,$inp,16
___
	}
	if ($lane) {
	    # Derive this word from the previous @X register by rotating
	    # it $SZ bytes.
	    $code.=<<___;
	vsldoi		@X[$i],@X[$i-1],@X[$i-1],$SZ
___
	} elsif ($LENDIAN) {
	    # First word of a vector on little-endian: run the loaded
	    # vector through the $lemask permutation.
	    $code.=<<___;
	vperm		@X[$i],@X[$i],@X[$i],$lemask
___
	}
    }

    if ($i>=15) {
	# Message schedule: X[j] += sigma0(X[j+1]) + sigma1(X[j+14]) + X[j+9]
	$code.=<<___;
	vshasigma${sz}	$Sigma,@X[($j+1)%16],0,0
	vaddu${sz}m	@X[$j],@X[$j],$Sigma
	vshasigma${sz}	$Sigma,@X[($j+14)%16],0,15
	vaddu${sz}m	@X[$j],@X[$j],$Sigma
	vaddu${sz}m	@X[$j],@X[$j],@X[($j+9)%16]
___
    }

    # The round proper; the constant in $Ki is added to $g, which takes
    # the role of h in the following round.
    $code.=<<___;
	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
	vshasigma${sz}	$Sigma,$e,1,15		; Sigma1(e)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma1(e)
	vxor		$Func,$a,$b
	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
	vaddu${sz}m	$d,$d,$h		; d+=h
	vshasigma${sz}	$Sigma,$a,1,0		; Sigma0(a)
	vaddu${sz}m	$Sigma,$Sigma,$Func	; Sigma0(a)+Maj(a,b,c)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma0(a)+Maj(a,b,c)
	lvx		$Ki,@I[$k],$idx		; load next K[i]
___

    # The last index operand has just been used: move $idx on by eight
    # table entries.
    if ($k==7) {
	$code.=<<___;
	addi		$idx,$idx,0x80
___
    }
}
142
# Function prologue: allocate the frame, save v24-v31, vrsave, r26-r31
# and the link register, load the index registers $x10..$x70 with
# 0x10..0x70, call LPICmeup (which leaves a table pointer in $Tbl) and
# point $offload at the scratch area inside the frame.
$code=<<___;
.machine	"any"
.text

.globl	$func
.align	6
$func:
	$STU		$sp,-$FRAME($sp)
	mflr		$lrsave
	li		r10,`$LOCALS+15`
	li		r11,`$LOCALS+31`
	stvx		v24,r10,$sp		# ABI says so
	addi		r10,r10,32
	mfspr		$vrsave,256
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r11,-4096+255		# 0xfffff0ff
	stw		$vrsave,`$FRAME-6*$SIZE_T-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME-6*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME-5*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME-4*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME-3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME-2*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME-1*$SIZE_T`($sp)
	li		$x70,0x70
	$PUSH		$lrsave,`$FRAME+$LRSAVE`($sp)
	mtspr		256,r11

	bl		LPICmeup
	addi		$offload,$sp,`8*$SIZE_T+15`
___

# Little-endian builds only: construct $lemask, the vperm control that
# ROUND applies to freshly loaded input vectors.
if ($LENDIAN) {
    $code.=<<___;
	li		$idx,8
	lvsl		$lemask,0,$idx
	vspltisb	$Ki,0x0f
	vxor		$lemask,$lemask,$Ki
___
}

# Load the hash state from $ctx and spread it over $A..$H, one working
# variable per vector register.
if ($SZ==4) {
    $code.=<<___;
	lvx_4w		$A,$x00,$ctx
	lvx_4w		$E,$x10,$ctx
	vsldoi		$B,$A,$A,4		# unpack
	vsldoi		$C,$A,$A,8
	vsldoi		$D,$A,$A,12
	vsldoi		$F,$E,$E,4
	vsldoi		$G,$E,$E,8
	vsldoi		$H,$E,$E,12
___
} elsif ($SZ==8) {
    $code.=<<___;
	lvx_u		$A,$x00,$ctx
	lvx_u		$C,$x10,$ctx
	lvx_u		$E,$x20,$ctx
	vsldoi		$B,$A,$A,8		# unpack
	lvx_u		$G,$x30,$ctx
	vsldoi		$D,$C,$C,8
	vsldoi		$F,$E,$E,8
	vsldoi		$H,$G,$G,8
___
}

# Head of the per-block loop: fetch the first table entry and the first
# input vector, park $A-$H in the scratch area, add the first constant
# to $H ahead of round 0 and fetch the second table entry.
$code.=<<___;
	li		r0,`($rounds-16)/16`	# inner loop counter
	b		Loop
.align	5
Loop:
	lvx		$Ki,$x00,$Tbl
	lvx_u		@X[0],0,$inp
	addi		$inp,$inp,16
	mr		$idx,$Tbl		# copy $Tbl
	stvx		$A,$x00,$offload	# offload $A-$H
	stvx		$B,$x10,$offload
	stvx		$C,$x20,$offload
	stvx		$D,$x30,$offload
	stvx		$E,$x40,$offload
	stvx		$F,$x50,$offload
	stvx		$G,$x60,$offload
	stvx		$H,$x70,$offload
	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
	lvx		$Ki,$x10,$Tbl
___
# Rounds 0..15 are emitted once per block, straight-line.  After every
# round the register-name list @V is rotated by one position, so that
# the next ROUND call sees the working variables under their new roles
# without any register moves in the generated code.
$i=0;
while ($i<16) {
    ROUND($i,@V);
    unshift(@V,pop(@V));
    $i++;
}

# Rounds 16..31 form the body of the counted loop at L16_xx; the count
# register is loaded from r0, which was set before Loop.
$code.=<<___;
	mtctr		r0
	b		L16_xx
.align	5
L16_xx:
___
while ($i<32) {
    ROUND($i,@V);
    unshift(@V,pop(@V));
    $i++;
}

# Block done: add the values parked in the scratch area back into
# $A-$H, count the block off $num and go round again while blocks remain.
$code.=<<___;
	bdnz		L16_xx

	lvx		@X[2],$x00,$offload
	subic.		$num,$num,1
	lvx		@X[3],$x10,$offload
	vaddu${sz}m	$A,$A,@X[2]
	lvx		@X[4],$x20,$offload
	vaddu${sz}m	$B,$B,@X[3]
	lvx		@X[5],$x30,$offload
	vaddu${sz}m	$C,$C,@X[4]
	lvx		@X[6],$x40,$offload
	vaddu${sz}m	$D,$D,@X[5]
	lvx		@X[7],$x50,$offload
	vaddu${sz}m	$E,$E,@X[6]
	lvx		@X[8],$x60,$offload
	vaddu${sz}m	$F,$F,@X[7]
	lvx		@X[9],$x70,$offload
	vaddu${sz}m	$G,$G,@X[8]
	vaddu${sz}m	$H,$H,@X[9]
	bne		Loop
___
# Merge $A..$H back into the layout used by the context and store it.
# The vperm controls are whatever the last round left in $Ki, plus (for
# the 4-byte variant) two more vectors fetched relative to $idx.
if ($SZ==4) {
    $code.=<<___;
	lvx		@X[0],$x20,$idx
	vperm		$A,$A,$B,$Ki		# pack the answer
	lvx		@X[1],$x30,$idx
	vperm		$E,$E,$F,$Ki
	vperm		$A,$A,$C,@X[0]
	vperm		$E,$E,$G,@X[0]
	vperm		$A,$A,$D,@X[1]
	vperm		$E,$E,$H,@X[1]
	stvx_4w		$A,$x00,$ctx
	stvx_4w		$E,$x10,$ctx
___
} elsif ($SZ==8) {
    $code.=<<___;
	vperm		$A,$A,$B,$Ki		# pack the answer
	vperm		$C,$C,$D,$Ki
	vperm		$E,$E,$F,$Ki
	vperm		$G,$G,$H,$Ki
	stvx_u		$A,$x00,$ctx
	stvx_u		$C,$x10,$ctx
	stvx_u		$E,$x20,$ctx
	stvx_u		$G,$x30,$ctx
___
}

# Epilogue, mirroring the prologue: restore the link register, vrsave,
# v24-v31 and r26-r31, release the frame and return.
$code.=<<___;
	addi		$offload,$sp,`$LOCALS+15`
	mtlr		$lrsave
	mtspr		256,$vrsave
	lvx		v24,$x00,$offload	# ABI says so
	lvx		v25,$x10,$offload
	lvx		v26,$x20,$offload
	lvx		v27,$x30,$offload
	lvx		v28,$x40,$offload
	lvx		v29,$x50,$offload
	lvx		v30,$x60,$offload
	lvx		v31,$x70,$offload
	$POP		r26,`$FRAME-6*$SIZE_T`($sp)
	$POP		r27,`$FRAME-5*$SIZE_T`($sp)
	$POP		r28,`$FRAME-4*$SIZE_T`($sp)
	$POP		r29,`$FRAME-3*$SIZE_T`($sp)
	$POP		r30,`$FRAME-2*$SIZE_T`($sp)
	$POP		r31,`$FRAME-1*$SIZE_T`($sp)
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,6,3,0
	.long		0
.size	$func,.-$func
___
313
# LPICmeup: helper that obtains its own address with a branch-and-link
# to the next instruction and adds a fixed distance to it, so that $Tbl
# ends up pointing at the first data entry emitted after the helper.
# The trailing .space pads the helper to 64 bytes to make that distance
# come out right.
#
# Ugly hack here, because PPC assembler syntax seems to vary too much
# from platform to platform...
{
    my $picmeup=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
	addi	$Tbl,$Tbl,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
___
    $code.=$picmeup;
}
329
# Round-constant table.  Every constant is replicated across a full
# 16-byte row, so one lvx fetches it for all lanes.  The list ends with
# an all-zero row (ROUND always fetches a constant ahead of its use, so
# the last rounds read past the real constants), followed by the vperm
# control vector(s) used when packing the answer; those come in a
# big-endian and a lane-swapped little-endian flavour.
if ($SZ==8) {
    # One row per 64-bit constant: the value twice.
    my $emit_rows = sub {
	$code.=join("",map(".quad	$_,$_\n",@_));
    };
    $emit_rows->(qw(
	0x428a2f98d728ae22 0x7137449123ef65cd
	0xb5c0fbcfec4d3b2f 0xe9b5dba58189dbbc
	0x3956c25bf348b538 0x59f111f1b605d019
	0x923f82a4af194f9b 0xab1c5ed5da6d8118
	0xd807aa98a3030242 0x12835b0145706fbe
	0x243185be4ee4b28c 0x550c7dc3d5ffb4e2
	0x72be5d74f27b896f 0x80deb1fe3b1696b1
	0x9bdc06a725c71235 0xc19bf174cf692694
	0xe49b69c19ef14ad2 0xefbe4786384f25e3
	0x0fc19dc68b8cd5b5 0x240ca1cc77ac9c65
	0x2de92c6f592b0275 0x4a7484aa6ea6e483
	0x5cb0a9dcbd41fbd4 0x76f988da831153b5
	0x983e5152ee66dfab 0xa831c66d2db43210
	0xb00327c898fb213f 0xbf597fc7beef0ee4
	0xc6e00bf33da88fc2 0xd5a79147930aa725
	0x06ca6351e003826f 0x142929670a0e6e70
	0x27b70a8546d22ffc 0x2e1b21385c26c926
	0x4d2c6dfc5ac42aed 0x53380d139d95b3df
	0x650a73548baf63de 0x766a0abb3c77b2a8
	0x81c2c92e47edaee6 0x92722c851482353b
	0xa2bfe8a14cf10364 0xa81a664bbc423001
	0xc24b8b70d0f89791 0xc76c51a30654be30
	0xd192e819d6ef5218 0xd69906245565a910
	0xf40e35855771202a 0x106aa07032bbd1b8
	0x19a4c116b8d2d0c8 0x1e376c085141ab53
	0x2748774cdf8eeb99 0x34b0bcb5e19b48a8
	0x391c0cb3c5c95a63 0x4ed8aa4ae3418acb
	0x5b9cca4f7763e373 0x682e6ff3d6b2b8a3
	0x748f82ee5defb2fc 0x78a5636f43172f60
	0x84c87814a1f0ab72 0x8cc702081a6439ec
	0x90befffa23631e28 0xa4506cebde82bde9
	0xbef9a3f7b2c67915 0xc67178f2e372532b
	0xca273eceea26619c 0xd186b8c721c0c207
	0xeada7dd6cde0eb1e 0xf57d4f7fee6ed178
	0x06f067aa72176fba 0x0a637dc5a2c898a6
	0x113f9804bef90dae 0x1b710b35131c471b
	0x28db77f523047d84 0x32caab7b40c72493
	0x3c9ebe0a15c9bebc 0x431d67c49c100d4c
	0x4cc5d4becb3e42b6 0x597f299cfc657e2a
	0x5fcb6fab3ad6faec 0x6c44198c4a475817
	0));
    if ($LENDIAN) {	# quad-swapped
	$code.=<<___;
.quad	0x1011121314151617,0x0001020304050607
___
    } else {
	$code.=<<___;
.quad	0x0001020304050607,0x1011121314151617
___
    }
} else {
    # One row per 32-bit constant: the value four times.
    my $emit_rows = sub {
	$code.=join("",map(".long	$_,$_,$_,$_\n",@_));
    };
    $emit_rows->(qw(
	0x428a2f98 0x71374491 0xb5c0fbcf 0xe9b5dba5
	0x3956c25b 0x59f111f1 0x923f82a4 0xab1c5ed5
	0xd807aa98 0x12835b01 0x243185be 0x550c7dc3
	0x72be5d74 0x80deb1fe 0x9bdc06a7 0xc19bf174
	0xe49b69c1 0xefbe4786 0x0fc19dc6 0x240ca1cc
	0x2de92c6f 0x4a7484aa 0x5cb0a9dc 0x76f988da
	0x983e5152 0xa831c66d 0xb00327c8 0xbf597fc7
	0xc6e00bf3 0xd5a79147 0x06ca6351 0x14292967
	0x27b70a85 0x2e1b2138 0x4d2c6dfc 0x53380d13
	0x650a7354 0x766a0abb 0x81c2c92e 0x92722c85
	0xa2bfe8a1 0xa81a664b 0xc24b8b70 0xc76c51a3
	0xd192e819 0xd6990624 0xf40e3585 0x106aa070
	0x19a4c116 0x1e376c08 0x2748774c 0x34b0bcb5
	0x391c0cb3 0x4ed8aa4a 0x5b9cca4f 0x682e6ff3
	0x748f82ee 0x78a5636f 0x84c87814 0x8cc70208
	0x90befffa 0xa4506ceb 0xbef9a3f7 0xc67178f2
	0));
    if ($LENDIAN) {	# word-swapped
	$code.=<<___;
.long	0x10111213,0x10111213,0x10111213,0x00010203
.long	0x10111213,0x10111213,0x04050607,0x00010203
.long	0x10111213,0x08090a0b,0x04050607,0x00010203
___
    } else {
	$code.=<<___;
.long	0x00010203,0x10111213,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x08090a0b,0x10111213
___
    }
}
# Identification string, placed after the data.
{
    my $ident=<<___;
.asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
    $code.=$ident;
}

# Every `...` span in the generated text is a Perl expression: evaluate
# it and splice the result in place of the span.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# STDOUT is the pipe to the translator; closing it is what reports a
# failure on that side.
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";
421