1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 for PowerISA v2.07.
11#
12# Accurate performance measurements are problematic, because it's
13# always virtualized setup with possibly throttled processor.
14# Relative comparison is therefore more informative. This module is
15# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
16# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
17# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
18# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
19# result is degree of computational resources' utilization. POWER8 is
20# "massively multi-threaded chip" and difference between single- and
# maximum multi-process benchmark results tells that utilization is
# whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
23# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
24# to single-process one, given that all threads end up on the same
25# physical core.
26
# Command line: <flavour> <output>. Both values are passed on verbatim
# to ppc-xlate.pl, which receives the generated code on its stdin.
$flavour=shift;
$output =shift;

# Pick pointer size and the matching stack load/store mnemonics from
# the flavour string; anything that names neither 64 nor 32 is rejected.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;	# offset added to the frame size when saving LR
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Little-endian targets are recognised by "le" in the flavour.
$LENDIAN=($flavour=~/le/);

# Look for the translator next to this script first, then in the
# shared perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on goes through the translator.
# Low-precedence "or" is required: with "||" the die attaches to the
# (always true) command string and a failed open goes unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
52
# Select the hash variant from the output name: "512" anywhere in
# $output means SHA-512, otherwise SHA-256.
#   $bits   - digest family, used in the function name and banner
#   $SZ     - size of one working word in bytes
#   $sz     - element-size suffix spliced into vector mnemonics
#   $rounds - number of compression rounds per block
if ($output =~ /512/) {
	$bits=512;
	$SZ=8;
	$sz="d";
	$rounds=80;
} else {
	$bits=256;
	$SZ=4;
	$sz="w";
	$rounds=64;
}

$func="sha${bits}_block_p8";	# exported symbol
$FRAME=8*$SIZE_T;		# base of the local frame area
67
# General-purpose register roles used by the templates below.
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$Tbl="r6";
$idx="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
# Index registers; the prologue loads 0x10..0x70 into $x10..$x70.
# $x00 is r0 (plain 0 on osx flavours).
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
 $x00=0 if ($flavour =~ /osx/);

# Vector registers: working variables a..h, the 16-entry message
# schedule, and per-round temporaries.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..23));
# Seven names take v24..v30; v31 from the list is left unassigned.
($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
84
# ROUND($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for one compression round to $code.
#   $i      - round number; selects message-load and schedule handling
#   $a..$h  - names of the vector registers that hold the eight working
#             variables in this round (the caller rotates them)
#
# For $i<16 the message words are brought in: a fresh 16-byte load is
# issued ahead of time on the last word of each vector, the remaining
# words are obtained by rotating the previous register, and on
# little-endian targets each freshly loaded vector is permuted with
# $lemask. For $i>=15 the back-quoted lines (evaluated when the final
# text is post-processed) update the schedule entry X[$j], $j=($i+1)%16,
# for a later round.
# Reads the globals $SZ, $sz, $LENDIAN, @X and the register names;
# returns nothing useful.
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;

$code.=<<___		if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
	addi		$inp,$inp,16
___
$code.=<<___		if ($i<16 && ($i%(16/$SZ)));
	vsldoi		@X[$i],@X[$i-1],@X[$i-1],$SZ
___
$code.=<<___		if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
	vperm		@X[$i],@X[$i],@X[$i],$lemask
___
$code.=<<___;
	`"vshasigma${sz}	$s0,@X[($j+1)%16],0,0"		if ($i>=15)`
	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
	vshasigma${sz}	$S1,$e,1,15		; Sigma1(e)
	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
	vshasigma${sz}	$S0,$a,1,0		; Sigma0(a)
	`"vshasigma${sz}	$s1,@X[($j+14)%16],0,15"	if ($i>=15)`
	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
	vxor		$Func,$a,$b
	`"vaddu${sz}m		@X[$j],@X[$j],@X[($j+9)%16]"	if ($i>=15)`
	vaddu${sz}m	$h,$h,$S1		; h+=Sigma1(e)
	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
	vaddu${sz}m	$d,$d,$h		; d+=h
	vaddu${sz}m	$S0,$S0,$Func		; Sigma0(a)+Maj(a,b,c)
	`"vaddu${sz}m		@X[$j],@X[$j],$s0"		if ($i>=15)`
	lvx		$Ki,$idx,$Tbl		; load next K[i]
	addi		$idx,$idx,16
	vaddu${sz}m	$h,$h,$S0		; h+=Sigma0(a)+Maj(a,b,c)
	`"vaddu${sz}m		@X[$j],@X[$j],$s1"		if ($i>=15)`
___
}
121
# Function entry. Allocates the stack frame, saves v20-v31, VRSAVE,
# r26-r31 and the link register, sets VRSAVE to all ones, preloads the
# 0x10..0x70 index registers, calls LPICmeup to obtain the constant
# table address in $Tbl, and points $offload at the in-frame scratch
# area used to park the working variables.
# NOTE(review): the +15/+31 biases on the vector save offsets presumably
# rely on stvx/lvx ignoring the low four address bits - confirm.
# Back-quoted expressions are evaluated when the text is post-processed.
$code=<<___;
.machine	"any"
.text

.globl	$func
.align	6
$func:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	mflr		$lrsave
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	mfspr		$vrsave,256
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r11,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	$PUSH		$lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
	mtspr		256,r11

	bl		LPICmeup
	addi		$offload,$sp,$FRAME+15
___
# Little-endian only: build the permute mask in $lemask that ROUND
# applies to every freshly loaded message vector.
$code.=<<___		if ($LENDIAN);
	li		$idx,8
	lvsl		$lemask,0,$idx
	vspltisb	$Ki,0x0f
	vxor		$lemask,$lemask,$Ki
___
# SHA-256: two loads fetch the hash state from $ctx; each word is then
# rotated into its own register ($A..$D from the first load, $E..$H
# from the second).
$code.=<<___		if ($SZ==4);
	lvx_4w		$A,$x00,$ctx
	lvx_4w		$E,$x10,$ctx
	vsldoi		$B,$A,$A,4		# unpack
	vsldoi		$C,$A,$A,8
	vsldoi		$D,$A,$A,12
	vsldoi		$F,$E,$E,4
	vsldoi		$G,$E,$E,8
	vsldoi		$H,$E,$E,12
___
# SHA-512: four loads, two state words each; the odd-numbered variable
# is obtained by rotating its partner by 8 bytes.
$code.=<<___		if ($SZ==8);
	lvx_u		$A,$x00,$ctx
	lvx_u		$C,$x10,$ctx
	lvx_u		$E,$x20,$ctx
	vsldoi		$B,$A,$A,8		# unpack
	lvx_u		$G,$x30,$ctx
	vsldoi		$D,$C,$C,8
	vsldoi		$F,$E,$E,8
	vsldoi		$H,$G,$G,8
___
# Per-block loop head: load the first table entry and the first 16
# bytes of input, park the incoming $A-$H in the $offload scratch area,
# pre-add the first constant to h and fetch the next one.
# r0 holds the trip count for the inner L16_xx loop.
$code.=<<___;
	li		r0,`($rounds-16)/16`	# inner loop counter
	b		Loop
.align	5
Loop:
	lvx		$Ki,$x00,$Tbl
	li		$idx,16
	lvx_u		@X[0],0,$inp
	addi		$inp,$inp,16
	stvx		$A,$x00,$offload	# offload $A-$H
	stvx		$B,$x10,$offload
	stvx		$C,$x20,$offload
	stvx		$D,$x30,$offload
	stvx		$E,$x40,$offload
	stvx		$F,$x50,$offload
	stvx		$G,$x60,$offload
	stvx		$H,$x70,$offload
	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
	lvx		$Ki,$idx,$Tbl
	addi		$idx,$idx,16
___
# Rounds 0..15, emitted once. @V is rotated after every round so the
# register that held h becomes the next round's a.
for ($i=0;$i<16;$i++)	{ ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mtctr		r0
	b		L16_xx
.align	5
L16_xx:
___
# Rounds 16..31 form the body of L16_xx, which bdnz repeats CTR times;
# $i deliberately continues from the previous loop.
for (;$i<32;$i++)	{ ROUND($i,@V); unshift(@V,pop(@V)); }
# Add the parked values back into $A-$H, decrement the block counter
# and go round again while blocks remain.
$code.=<<___;
	bdnz		L16_xx

	lvx		@X[2],$x00,$offload
	subic.		$num,$num,1
	lvx		@X[3],$x10,$offload
	vaddu${sz}m	$A,$A,@X[2]
	lvx		@X[4],$x20,$offload
	vaddu${sz}m	$B,$B,@X[3]
	lvx		@X[5],$x30,$offload
	vaddu${sz}m	$C,$C,@X[4]
	lvx		@X[6],$x40,$offload
	vaddu${sz}m	$D,$D,@X[5]
	lvx		@X[7],$x50,$offload
	vaddu${sz}m	$E,$E,@X[6]
	lvx		@X[8],$x60,$offload
	vaddu${sz}m	$F,$F,@X[7]
	lvx		@X[9],$x70,$offload
	vaddu${sz}m	$G,$G,@X[8]
	vaddu${sz}m	$H,$H,@X[9]
	bne		Loop
___
# SHA-256: merge the eight single-word registers back into two vectors
# and store them to $ctx. $Ki and two further table entries loaded
# into @X[0]/@X[1] serve as the vperm control vectors.
$code.=<<___		if ($SZ==4);
	lvx		@X[0],$idx,$Tbl
	addi		$idx,$idx,16
	vperm		$A,$A,$B,$Ki		# pack the answer
	lvx		@X[1],$idx,$Tbl
	vperm		$E,$E,$F,$Ki
	vperm		$A,$A,$C,@X[0]
	vperm		$E,$E,$G,@X[0]
	vperm		$A,$A,$D,@X[1]
	vperm		$E,$E,$H,@X[1]
	stvx_4w		$A,$x00,$ctx
	stvx_4w		$E,$x10,$ctx
___
# SHA-512: merge register pairs with the single control vector in $Ki
# and store four vectors to $ctx.
$code.=<<___		if ($SZ==8);
	vperm		$A,$A,$B,$Ki		# pack the answer
	vperm		$C,$C,$D,$Ki
	vperm		$E,$E,$F,$Ki
	vperm		$G,$G,$H,$Ki
	stvx_u		$A,$x00,$ctx
	stvx_u		$C,$x10,$ctx
	stvx_u		$E,$x20,$ctx
	stvx_u		$G,$x30,$ctx
___
# Function exit: restore LR and VRSAVE from the registers they were
# kept in, reload v20-v31 and r26-r31 from the frame using the same
# offsets as the entry code, release the frame and return. The trailing
# .long/.byte words and .size directive close the function.
$code.=<<___;
	li		r10,`$FRAME+8*16+15`
	mtlr		$lrsave
	li		r11,`$FRAME+8*16+31`
	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,4,1,0x80,6,3,0
	.long		0
.size	$func,.-$func
___
317
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# LPICmeup returns the address of the constant table in $Tbl without
# relocations: bcl to the next instruction puts the current address in
# LR, and a fixed distance is added to reach the data emitted right
# after this stub. The caller's LR is preserved in r0. The .space
# padding keeps the stub at 64 bytes so that distance stays constant.
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
	addi	$Tbl,$Tbl,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
___
333
# Constant table, emitted directly after LPICmeup. Every round constant
# is replicated across a full 16-byte row so one vector load yields it
# in every lane. The final "0" row follows the last constant; the
# rounds fetch table entries ahead of use, so this row is what the last
# round's look-ahead addition picks up. After it come the vperm control
# vectors used when packing the result, in an endian-specific layout.
if ($SZ==8) {
    # SHA-512: one 64-bit constant duplicated into both halves.
    my $emit_rows = sub {
	foreach my $k (@_) { $code.=".quad	$k,$k\n"; }
    };
    $emit_rows->(
	"0x428a2f98d728ae22","0x7137449123ef65cd",
	"0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
	"0x3956c25bf348b538","0x59f111f1b605d019",
	"0x923f82a4af194f9b","0xab1c5ed5da6d8118",
	"0xd807aa98a3030242","0x12835b0145706fbe",
	"0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
	"0x72be5d74f27b896f","0x80deb1fe3b1696b1",
	"0x9bdc06a725c71235","0xc19bf174cf692694",
	"0xe49b69c19ef14ad2","0xefbe4786384f25e3",
	"0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
	"0x2de92c6f592b0275","0x4a7484aa6ea6e483",
	"0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
	"0x983e5152ee66dfab","0xa831c66d2db43210",
	"0xb00327c898fb213f","0xbf597fc7beef0ee4",
	"0xc6e00bf33da88fc2","0xd5a79147930aa725",
	"0x06ca6351e003826f","0x142929670a0e6e70",
	"0x27b70a8546d22ffc","0x2e1b21385c26c926",
	"0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
	"0x650a73548baf63de","0x766a0abb3c77b2a8",
	"0x81c2c92e47edaee6","0x92722c851482353b",
	"0xa2bfe8a14cf10364","0xa81a664bbc423001",
	"0xc24b8b70d0f89791","0xc76c51a30654be30",
	"0xd192e819d6ef5218","0xd69906245565a910",
	"0xf40e35855771202a","0x106aa07032bbd1b8",
	"0x19a4c116b8d2d0c8","0x1e376c085141ab53",
	"0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
	"0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
	"0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
	"0x748f82ee5defb2fc","0x78a5636f43172f60",
	"0x84c87814a1f0ab72","0x8cc702081a6439ec",
	"0x90befffa23631e28","0xa4506cebde82bde9",
	"0xbef9a3f7b2c67915","0xc67178f2e372532b",
	"0xca273eceea26619c","0xd186b8c721c0c207",
	"0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
	"0x06f067aa72176fba","0x0a637dc5a2c898a6",
	"0x113f9804bef90dae","0x1b710b35131c471b",
	"0x28db77f523047d84","0x32caab7b40c72493",
	"0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
	"0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
	"0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
$code.=<<___	if (!$LENDIAN);
.quad	0x0001020304050607,0x1011121314151617
___
$code.=<<___	if ($LENDIAN);	# quad-swapped
.quad	0x1011121314151617,0x0001020304050607
___
} else {
    # SHA-256: one 32-bit constant duplicated into all four words.
    my $emit_rows = sub {
	foreach my $k (@_) { $code.=".long	$k,$k,$k,$k\n"; }
    };
    $emit_rows->(
	"0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
	"0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
	"0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
	"0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
	"0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
	"0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
	"0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
	"0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
	"0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
	"0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
	"0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
	"0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
	"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
	"0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
	"0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
	"0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
$code.=<<___	if (!$LENDIAN);
.long	0x00010203,0x10111213,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x08090a0b,0x10111213
___
$code.=<<___	if ($LENDIAN);	# word-swapped
.long	0x10111213,0x10111213,0x10111213,0x00010203
.long	0x10111213,0x10111213,0x04050607,0x00010203
.long	0x10111213,0x08090a0b,0x04050607,0x00010203
___
}
# Identification banner placed after the tables.
$code.=<<___;
.asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Evaluate every back-quoted Perl expression embedded in the templates
# and splice its value into the text, then hand the result to the
# translator on STDOUT.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the translator: buffered-write errors and a
# non-zero exit of the child are only reported by close, so check it.
close STDOUT or die "error closing STDOUT: $! (child status $?)";
425