1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# SHA256/512 for PowerISA v2.07.
18#
# Accurate performance measurements are problematic, because the setup
# is always virtualized, with a possibly throttled processor.
21# Relative comparison is therefore more informative. This module is
22# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
23# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
24# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
25# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is the degree of computational resources' utilization. POWER8
# is a "massively multi-threaded chip" and the difference between single-
# and maximum multi-process benchmark results tells that utilization is
# a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
30# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
31# to single-process one, given that all threads end up on the same
32# physical core.
33#
34######################################################################
35# Believed-to-be-accurate results in cycles per processed byte [on
36# little-endian system]. Numbers in square brackets are for 64-bit
37# build of sha512-ppc.pl, presented for reference.
38#
39#		POWER8		POWER9
40# SHA256	9.7 [15.8]	11.2 [12.5]
41# SHA512	6.1 [10.3]	7.0 [7.9]
42
# Command line is "[flavour] ... [output]":
#   $output  - the last argument, but only if it looks like a file name
#              (it ends in a dot followed by word characters);
#   $flavour - the first argument, but only if it contains no dot.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Word-size dependent parameters: pointer size, offset of the link
# register save slot, and the store-with-update/load/store mnemonics.
if ($flavour =~ /64/) {
	($SIZE_T,$STU,$POP,$PUSH) = (8,"stdu","ld","std");
	$LRSAVE = 2*$SIZE_T;
}
elsif ($flavour =~ /32/) {
	($SIZE_T,$STU,$POP,$PUSH) = (4,"stwu","lwz","stw");
	$LRSAVE = $SIZE_T;
}
else {
	die "nonsense $flavour";
}

# Any flavour name containing "le" selects little-endian code.
$LENDIAN = $flavour =~ /le/;

# Look for ppc-xlate.pl next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m{(.*[/\\])[^/\\]+$}; $dir = $1;
my $xlate_found = 0;
foreach my $candidate ("${dir}ppc-xlate.pl",
		       "${dir}../../perlasm/ppc-xlate.pl") {
	$xlate = $candidate;
	if (-f $xlate) { $xlate_found = 1; last; }
}
die "can't locate ppc-xlate.pl" unless $xlate_found;

# From here on everything printed to STDOUT is piped through the
# translator, which is handed the flavour and the output file name.
my $xlate_pipe = "| $^X $xlate $flavour \"$output\"";
open(STDOUT, $xlate_pipe)
    or die "can't call $xlate: $!";
71
# The hash variant is chosen by the output file name: a name containing
# "512" selects SHA-512, anything else SHA-256.
#   $bits   - digest family, used in the function name and banner
#   $SZ     - size of one hash word in bytes
#   $sz     - element-size suffix of the vector mnemonics ("d" or "w")
#   $rounds - number of rounds per block
($bits,$SZ,$sz,$rounds) = ($output =~ /512/) ? (512,8,"d",80)
					      : (256,4,"w",64);

$func = "sha" . $bits . "_block_p8";

# Stack frame sizes: $LOCALS is 8 pointer-sized slots plus 8 16-byte
# slots; $FRAME adds 9 more 16-byte slots and 6 pointer-sized slots
# (the prologue saves v24-v31 and r26-r31 above $LOCALS).
$LOCALS = 8*$SIZE_T + 8*16;
$FRAME  = $LOCALS + 9*16 + 6*$SIZE_T;

# General purpose registers.
($sp,$toc,$ctx,$inp,$num,$Tbl,$idx,$lrsave) = map("r$_", 1..8);
($offload,$vrsave) = ("r11","r12");

# Index operands for 16-byte-stride vector accesses.  $x00 is the
# literal 0, the rest are registers the prologue loads with 0x10..0x70.
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0, "r10", map("r$_", 26..31));
@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70);

# Vector registers: working variables a..h, the 16-entry message
# schedule window, and four temporaries.
($A,$B,$C,$D,$E,$F,$G,$H) = map("v$_", 0..7);
@V = ($A,$B,$C,$D,$E,$F,$G,$H);
@X = (map("v$_", 8..19), map("v$_", 24..27));
($Ki,$Func,$Sigma,$lemask) = ("v28","v29","v30","v31");
103
# ROUND($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the instructions for round $i to $code.  $a..$h are the names
# of the vector registers currently playing the roles of the working
# variables a..h.  The message schedule lives in @X (indexed modulo 16),
# $Ki holds the round constant and is reloaded for the following round
# from $idx plus one of the offsets in @I.
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

if ($i<16) {
	# Rounds 0..15 consume the input directly.
	my $per_vec=16/$SZ;		# message words per 16-byte vector
	my $lane=$i%$per_vec;		# position of X[i] within its vector
	my $xi=$X[$i];

	if ($i<15 && $lane==$per_vec-1) {
		# Last word of the current vector: fetch the next one early.
		my $xnext=$X[$i+1];
		$code.=<<___;
	lvx_u		$xnext,0,$inp		; load X[i] in advance
	addi		$inp,$inp,16
___
	}
	if ($lane) {
		# Derive X[i] from X[i-1] by rotating it one word.
		my $xprev=$X[$i-1];
		$code.=<<___;
	vsldoi		$xi,$xprev,$xprev,$SZ
___
	}
	if ($LENDIAN && $lane==0) {
		# Freshly loaded vector: permute it with $lemask.
		$code.=<<___;
	vperm		$xi,$xi,$xi,$lemask
___
	}
}

if ($i>=15) {
	# Message schedule update for the slot needed by the next round.
	my $next=($i+1)%16;
	my $xj=$X[$next];
	my ($x1,$x14,$x9)=map { $X[($next+$_)%16] } (1,14,9);
	$code.=<<___;
	vshasigma${sz}	$Sigma,$x1,0,0
	vaddu${sz}m	$xj,$xj,$Sigma
	vshasigma${sz}	$Sigma,$x14,0,15
	vaddu${sz}m	$xj,$xj,$Sigma
	vaddu${sz}m	$xj,$xj,$x9
___
}

# The round proper.
my $slot=($i+2)%8;		# which of the @I offsets addresses the next K
my $xcur=$X[$i%16];
my $koff=$I[$slot];
$code.=<<___;
	vaddu${sz}m	$h,$h,$xcur		; h+=X[i]
	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
	vshasigma${sz}	$Sigma,$e,1,15		; Sigma1(e)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma1(e)
	vxor		$Func,$a,$b
	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
	vaddu${sz}m	$d,$d,$h		; d+=h
	vshasigma${sz}	$Sigma,$a,1,0		; Sigma0(a)
	vaddu${sz}m	$Sigma,$Sigma,$Func	; Sigma0(a)+Maj(a,b,c)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma0(a)+Maj(a,b,c)
	lvx		$Ki,$koff,$idx		; load next K[i]
___

if ($slot==7) {
	# The last of the eight offsets was just used: advance $idx by 8*16.
	$code.=<<___;
	addi		$idx,$idx,0x80
___
}
}
145
# Function prologue: allocate $FRAME bytes of stack, save v24-v31 in the
# area starting at $LOCALS+15, save VRSAVE (SPR 256), r26-r31 and the
# link register, load the constants 0x10..0x70 into $x10..$x70 and write
# a new value to VRSAVE.  LPICmeup is then called (it leaves an address
# in $Tbl) and $offload is pointed at the scratch area used by the loop.
$code=<<___;
.machine	"any"
.text

.globl	$func
.align	6
$func:
	$STU		$sp,-$FRAME($sp)
	mflr		$lrsave
	li		r10,`$LOCALS+15`
	li		r11,`$LOCALS+31`
	stvx		v24,r10,$sp		# ABI says so
	addi		r10,r10,32
	mfspr		$vrsave,256
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r11,-4096+255		# 0xfffff0ff
	stw		$vrsave,`$FRAME-6*$SIZE_T-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME-6*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME-5*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME-4*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME-3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME-2*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME-1*$SIZE_T`($sp)
	li		$x70,0x70
	$PUSH		$lrsave,`$FRAME+$LRSAVE`($sp)
	mtspr		256,r11

	bl		LPICmeup
	addi		$offload,$sp,`8*$SIZE_T+15`
___
# Little-endian only: build $lemask (lvsl for offset 8, XORed with a
# vector of 0x0f bytes).  ROUND uses it as the vperm control applied to
# freshly loaded input vectors.
$code.=<<___		if ($LENDIAN);
	li		$idx,8
	lvsl		$lemask,0,$idx
	vspltisb	$Ki,0x0f
	vxor		$lemask,$lemask,$Ki
___
# 32-bit words: load the hash state from $ctx with two vector loads and
# unpack it into $A..$H by rotating each vector 4, 8 and 12 bytes.
$code.=<<___		if ($SZ==4);
	lvx_4w		$A,$x00,$ctx
	lvx_4w		$E,$x10,$ctx
	vsldoi		$B,$A,$A,4		# unpack
	vsldoi		$C,$A,$A,8
	vsldoi		$D,$A,$A,12
	vsldoi		$F,$E,$E,4
	vsldoi		$G,$E,$E,8
	vsldoi		$H,$E,$E,12
___
# 64-bit words: four vector loads, each unpacked into a register pair by
# an 8-byte rotate.
$code.=<<___		if ($SZ==8);
	lvx_u		$A,$x00,$ctx
	lvx_u		$C,$x10,$ctx
	lvx_u		$E,$x20,$ctx
	vsldoi		$B,$A,$A,8		# unpack
	lvx_u		$G,$x30,$ctx
	vsldoi		$D,$C,$C,8
	vsldoi		$F,$E,$E,8
	vsldoi		$H,$G,$G,8
___
# Outer loop head.  r0 is preloaded with the inner loop count,
# ($rounds-16)/16.  Each pass through Loop loads the first constant and
# the first input vector, resets $idx to $Tbl, stores $A-$H to the
# $offload area, pre-adds the first constant to $H and loads the second.
$code.=<<___;
	li		r0,`($rounds-16)/16`	# inner loop counter
	b		Loop
.align	5
Loop:
	lvx		$Ki,$x00,$Tbl
	lvx_u		@X[0],0,$inp
	addi		$inp,$inp,16
	mr		$idx,$Tbl		# copy $Tbl
	stvx		$A,$x00,$offload	# offload $A-$H
	stvx		$B,$x10,$offload
	stvx		$C,$x20,$offload
	stvx		$D,$x30,$offload
	stvx		$E,$x40,$offload
	stvx		$F,$x50,$offload
	stvx		$G,$x60,$offload
	stvx		$H,$x70,$offload
	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
	lvx		$Ki,$x10,$Tbl
___
# Rounds 0..15, emitted straight-line.  After every round @V is rotated
# right by one (its last element moves to the front), so the register
# that was h is passed as a in the next round, a as b, and so on.
for ($i=0;$i<16;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
# Load the count register from r0 and open the counted loop L16_xx.
$code.=<<___;
	mtctr		r0
	b		L16_xx
.align	5
L16_xx:
___
# Rounds 16..31 are emitted once; they form the body of L16_xx, which
# bdnz below repeats according to the count register.  $i carries on
# from the previous loop.
for (;$i<32;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
# After the last round: add the values offloaded at the top of Loop back
# into $A-$H, decrement $num and branch back to Loop while it is non-zero.
$code.=<<___;
	bdnz		L16_xx

	lvx		@X[2],$x00,$offload
	subic.		$num,$num,1
	lvx		@X[3],$x10,$offload
	vaddu${sz}m	$A,$A,@X[2]
	lvx		@X[4],$x20,$offload
	vaddu${sz}m	$B,$B,@X[3]
	lvx		@X[5],$x30,$offload
	vaddu${sz}m	$C,$C,@X[4]
	lvx		@X[6],$x40,$offload
	vaddu${sz}m	$D,$D,@X[5]
	lvx		@X[7],$x50,$offload
	vaddu${sz}m	$E,$E,@X[6]
	lvx		@X[8],$x60,$offload
	vaddu${sz}m	$F,$F,@X[7]
	lvx		@X[9],$x70,$offload
	vaddu${sz}m	$G,$G,@X[8]
	vaddu${sz}m	$H,$H,@X[9]
	bne		Loop
___
# 32-bit words: merge $A-$D into $A and $E-$H into $E with three vperm
# steps and store the two vectors to $ctx.  The vperm controls are $Ki
# and two vectors loaded from $idx+0x20 and $idx+0x30 (presumably the
# permutation vectors emitted after the constant table - confirm).
$code.=<<___		if ($SZ==4);
	lvx		@X[0],$x20,$idx
	vperm		$A,$A,$B,$Ki		# pack the answer
	lvx		@X[1],$x30,$idx
	vperm		$E,$E,$F,$Ki
	vperm		$A,$A,$C,@X[0]
	vperm		$E,$E,$G,@X[0]
	vperm		$A,$A,$D,@X[1]
	vperm		$E,$E,$H,@X[1]
	stvx_4w		$A,$x00,$ctx
	stvx_4w		$E,$x10,$ctx
___
# 64-bit words: merge each register pair with one vperm controlled by
# $Ki and store the four vectors to $ctx.
$code.=<<___		if ($SZ==8);
	vperm		$A,$A,$B,$Ki		# pack the answer
	vperm		$C,$C,$D,$Ki
	vperm		$E,$E,$F,$Ki
	vperm		$G,$G,$H,$Ki
	stvx_u		$A,$x00,$ctx
	stvx_u		$C,$x10,$ctx
	stvx_u		$E,$x20,$ctx
	stvx_u		$G,$x30,$ctx
___
# Function epilogue: restore the link register, VRSAVE, v24-v31 and
# r26-r31 from the slots written by the prologue, release the frame and
# return.
$code.=<<___;
	addi		$offload,$sp,`$LOCALS+15`
	mtlr		$lrsave
	mtspr		256,$vrsave
	lvx		v24,$x00,$offload	# ABI says so
	lvx		v25,$x10,$offload
	lvx		v26,$x20,$offload
	lvx		v27,$x30,$offload
	lvx		v28,$x40,$offload
	lvx		v29,$x50,$offload
	lvx		v30,$x60,$offload
	lvx		v31,$x70,$offload
	$POP		r26,`$FRAME-6*$SIZE_T`($sp)
	$POP		r27,`$FRAME-5*$SIZE_T`($sp)
	$POP		r28,`$FRAME-4*$SIZE_T`($sp)
	$POP		r29,`$FRAME-3*$SIZE_T`($sp)
	$POP		r30,`$FRAME-2*$SIZE_T`($sp)
	$POP		r31,`$FRAME-1*$SIZE_T`($sp)
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,6,3,0
	.long		0
.size	$func,.-$func
___
316
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# LPICmeup computes the address of the data that follows it without
# relying on assembler-specific relocation syntax: "bcl 20,31,$+4"
# branches to the next instruction and thereby places that instruction's
# address in the link register, which is copied to $Tbl and advanced by
# 64-8.  The block is 64-byte aligned and the second mflr sits 8 bytes
# into it, so $Tbl ends up 64 bytes past the label.  The 6 instructions,
# the .long and the 8 .byte values occupy 9*4 bytes; .space pads the
# block to exactly 64 bytes so that the data emitted next starts there.
# The caller's link register is preserved in r0 across the sequence.
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
	addi	$Tbl,$Tbl,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
___
332
# Round constant table followed by the permutation vectors.  Every
# constant is replicated to fill a whole 16-byte vector (two .quad or
# four .long values per row), and the list ends with an all-zero row.
if ($SZ==8) {
    my @K512 = qw(
	0x428a2f98d728ae22 0x7137449123ef65cd
	0xb5c0fbcfec4d3b2f 0xe9b5dba58189dbbc
	0x3956c25bf348b538 0x59f111f1b605d019
	0x923f82a4af194f9b 0xab1c5ed5da6d8118
	0xd807aa98a3030242 0x12835b0145706fbe
	0x243185be4ee4b28c 0x550c7dc3d5ffb4e2
	0x72be5d74f27b896f 0x80deb1fe3b1696b1
	0x9bdc06a725c71235 0xc19bf174cf692694
	0xe49b69c19ef14ad2 0xefbe4786384f25e3
	0x0fc19dc68b8cd5b5 0x240ca1cc77ac9c65
	0x2de92c6f592b0275 0x4a7484aa6ea6e483
	0x5cb0a9dcbd41fbd4 0x76f988da831153b5
	0x983e5152ee66dfab 0xa831c66d2db43210
	0xb00327c898fb213f 0xbf597fc7beef0ee4
	0xc6e00bf33da88fc2 0xd5a79147930aa725
	0x06ca6351e003826f 0x142929670a0e6e70
	0x27b70a8546d22ffc 0x2e1b21385c26c926
	0x4d2c6dfc5ac42aed 0x53380d139d95b3df
	0x650a73548baf63de 0x766a0abb3c77b2a8
	0x81c2c92e47edaee6 0x92722c851482353b
	0xa2bfe8a14cf10364 0xa81a664bbc423001
	0xc24b8b70d0f89791 0xc76c51a30654be30
	0xd192e819d6ef5218 0xd69906245565a910
	0xf40e35855771202a 0x106aa07032bbd1b8
	0x19a4c116b8d2d0c8 0x1e376c085141ab53
	0x2748774cdf8eeb99 0x34b0bcb5e19b48a8
	0x391c0cb3c5c95a63 0x4ed8aa4ae3418acb
	0x5b9cca4f7763e373 0x682e6ff3d6b2b8a3
	0x748f82ee5defb2fc 0x78a5636f43172f60
	0x84c87814a1f0ab72 0x8cc702081a6439ec
	0x90befffa23631e28 0xa4506cebde82bde9
	0xbef9a3f7b2c67915 0xc67178f2e372532b
	0xca273eceea26619c 0xd186b8c721c0c207
	0xeada7dd6cde0eb1e 0xf57d4f7fee6ed178
	0x06f067aa72176fba 0x0a637dc5a2c898a6
	0x113f9804bef90dae 0x1b710b35131c471b
	0x28db77f523047d84 0x32caab7b40c72493
	0x3c9ebe0a15c9bebc 0x431d67c49c100d4c
	0x4cc5d4becb3e42b6 0x597f299cfc657e2a
	0x5fcb6fab3ad6faec 0x6c44198c4a475817
	0
    );
    $code .= join("", map(".quad\t$_,$_\n", @K512));

    # Permutation vector; its two halves are swapped on little-endian.
    if ($LENDIAN) {	# quad-swapped
	$code .= <<___;
.quad	0x1011121314151617,0x0001020304050607
___
    } else {
	$code .= <<___;
.quad	0x0001020304050607,0x1011121314151617
___
    }
} else {
    my @K256 = qw(
	0x428a2f98 0x71374491 0xb5c0fbcf 0xe9b5dba5
	0x3956c25b 0x59f111f1 0x923f82a4 0xab1c5ed5
	0xd807aa98 0x12835b01 0x243185be 0x550c7dc3
	0x72be5d74 0x80deb1fe 0x9bdc06a7 0xc19bf174
	0xe49b69c1 0xefbe4786 0x0fc19dc6 0x240ca1cc
	0x2de92c6f 0x4a7484aa 0x5cb0a9dc 0x76f988da
	0x983e5152 0xa831c66d 0xb00327c8 0xbf597fc7
	0xc6e00bf3 0xd5a79147 0x06ca6351 0x14292967
	0x27b70a85 0x2e1b2138 0x4d2c6dfc 0x53380d13
	0x650a7354 0x766a0abb 0x81c2c92e 0x92722c85
	0xa2bfe8a1 0xa81a664b 0xc24b8b70 0xc76c51a3
	0xd192e819 0xd6990624 0xf40e3585 0x106aa070
	0x19a4c116 0x1e376c08 0x2748774c 0x34b0bcb5
	0x391c0cb3 0x4ed8aa4a 0x5b9cca4f 0x682e6ff3
	0x748f82ee 0x78a5636f 0x84c87814 0x8cc70208
	0x90befffa 0xa4506ceb 0xbef9a3f7 0xc67178f2
	0
    );
    $code .= join("", map(".long\t$_,$_,$_,$_\n", @K256));

    # Three permutation vectors; the words of each row are reversed on
    # little-endian.
    if ($LENDIAN) {	# word-swapped
	$code .= <<___;
.long	0x10111213,0x10111213,0x10111213,0x00010203
.long	0x10111213,0x10111213,0x04050607,0x00010203
.long	0x10111213,0x08090a0b,0x04050607,0x00010203
___
    } else {
	$code .= <<___;
.long	0x00010203,0x10111213,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x08090a0b,0x10111213
___
    }
}
# Identification string embedded in the output, followed by re-alignment.
$code.=<<___;
.asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Replace every `...` span in the generated text with the result of
# evaluating its contents as Perl (this is how the frame offsets and
# other constant expressions above are turned into numbers).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to ppc-xlate.pl; check close so that a failure there
# is reported rather than silently ignored.
close STDOUT or die "error closing STDOUT: $!";
424