1#! /usr/bin/env perl
2# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# SHA1 block procedure for s390x.
18
19# April 2007.
20#
21# Performance is >30% better than gcc 3.3 generated code. But the real
22# twist is that SHA1 hardware support is detected and utilized. In
23# which case performance can reach further >4.5x for larger chunks.
24
25# January 2009.
26#
27# Optimize Xupdate for amount of memory references and reschedule
28# instructions to favour dual-issue z10 pipeline. On z10 hardware is
29# "only" ~2.3x faster than software.
30
31# November 2010.
32#
33# Adapt for -m31 build. If kernel supports what's called "highgprs"
34# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
35# instructions and achieve "64-bit" performance even in 31-bit legacy
36# application context. The feature is not specific to any particular
37# processor, as long as it's "z-CPU". Latter implies that the code
38# remains z/Architecture specific. On z990 it was measured to perform
39# 23% better than code generated by gcc 4.3.
40
41$kimdfunc=1;	# magic function code for kimd instruction
42
43$flavour = shift;
44
45if ($flavour =~ /3[12]/) {
46	$SIZE_T=4;
47	$g="";
48} else {
49	$SIZE_T=8;
50	$g="g";
51}
52
53while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
54open STDOUT,">$output";
55
56$K_00_39="%r0"; $K=$K_00_39;
57$K_40_79="%r1";
58$ctx="%r2";	$prefetch="%r2";
59$inp="%r3";
60$len="%r4";
61
62$A="%r5";
63$B="%r6";
64$C="%r7";
65$D="%r8";
66$E="%r9";	@V=($A,$B,$C,$D,$E);
67$t0="%r10";
68$t1="%r11";
69@X=("%r12","%r13","%r14");
70$sp="%r15";
71
72$stdframe=16*$SIZE_T+4*8;
73$frame=$stdframe+16*4;
74
75sub Xupdate {
76my $i=shift;
77
78$code.=<<___ if ($i==15);
79	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
80	lr	$X[0],$X[2]
81___
82return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
83$code.=<<___ if ($i<16);
84	lg	$X[0],`$i*4`($inp)	### Xload($i)
85	rllg	$X[1],$X[0],32
86___
87$code.=<<___ if ($i>=16);
88	xgr	$X[0],$prefetch		### Xupdate($i)
89	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
90	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
91	xgr	$X[0],$prefetch
92	rll	$X[0],$X[0],1
93	rllg	$X[1],$X[0],32
94	rll	$X[1],$X[1],1
95	rllg	$X[0],$X[1],32
96	lr	$X[2],$X[1]		# feedback
97___
98$code.=<<___ if ($i<=70);
99	stg	$X[0],`$stdframe+4*($i%16)`($sp)
100___
101unshift(@X,pop(@X));
102}
103
104sub BODY_00_19 {
105my ($i,$a,$b,$c,$d,$e)=@_;
106my $xi=$X[1];
107
108	&Xupdate($i);
109$code.=<<___;
110	alr	$e,$K		### $i
111	rll	$t1,$a,5
112	lr	$t0,$d
113	xr	$t0,$c
114	alr	$e,$t1
115	nr	$t0,$b
116	alr	$e,$xi
117	xr	$t0,$d
118	rll	$b,$b,30
119	alr	$e,$t0
120___
121}
122
123sub BODY_20_39 {
124my ($i,$a,$b,$c,$d,$e)=@_;
125my $xi=$X[1];
126
127	&Xupdate($i);
128$code.=<<___;
129	alr	$e,$K		### $i
130	rll	$t1,$a,5
131	lr	$t0,$b
132	alr	$e,$t1
133	xr	$t0,$c
134	alr	$e,$xi
135	xr	$t0,$d
136	rll	$b,$b,30
137	alr	$e,$t0
138___
139}
140
141sub BODY_40_59 {
142my ($i,$a,$b,$c,$d,$e)=@_;
143my $xi=$X[1];
144
145	&Xupdate($i);
146$code.=<<___;
147	alr	$e,$K		### $i
148	rll	$t1,$a,5
149	lr	$t0,$b
150	alr	$e,$t1
151	or	$t0,$c
152	lr	$t1,$b
153	nr	$t0,$d
154	nr	$t1,$c
155	alr	$e,$xi
156	or	$t0,$t1
157	rll	$b,$b,30
158	alr	$e,$t0
159___
160}
161
162$code.=<<___;
163#include "s390x_arch.h"
164
165.text
166.align	64
167.type	Ktable,\@object
168Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
169	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
170.size	Ktable,.-Ktable
171.globl	sha1_block_data_order
172.type	sha1_block_data_order,\@function
173sha1_block_data_order:
174___
175$code.=<<___ if ($kimdfunc);
176	larl	%r1,OPENSSL_s390xcap_P
177	lg	%r0,S390X_KIMD(%r1)	# check kimd capabilities
178	tmhh	%r0,`0x8000>>$kimdfunc`
179	jz	.Lsoftware
180	lghi	%r0,$kimdfunc
181	lgr	%r1,$ctx
182	lgr	%r2,$inp
183	sllg	%r3,$len,6
184	.long	0xb93e0002	# kimd %r0,%r2
185	brc	1,.-4		# pay attention to "partial completion"
186	br	%r14
187.align	16
188.Lsoftware:
189___
190$code.=<<___;
191	lghi	%r1,-$frame
192	st${g}	$ctx,`2*$SIZE_T`($sp)
193	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
194	lgr	%r0,$sp
195	la	$sp,0(%r1,$sp)
196	st${g}	%r0,0($sp)
197
198	larl	$t0,Ktable
199	llgf	$A,0($ctx)
200	llgf	$B,4($ctx)
201	llgf	$C,8($ctx)
202	llgf	$D,12($ctx)
203	llgf	$E,16($ctx)
204
205	lg	$K_00_39,0($t0)
206	lg	$K_40_79,8($t0)
207
208.Lloop:
209	rllg	$K_00_39,$K_00_39,32
210___
211for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
212$code.=<<___;
213	rllg	$K_00_39,$K_00_39,32
214___
215for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
216$code.=<<___;	$K=$K_40_79;
217	rllg	$K_40_79,$K_40_79,32
218___
219for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
220$code.=<<___;
221	rllg	$K_40_79,$K_40_79,32
222___
223for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
224$code.=<<___;
225
226	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
227	la	$inp,64($inp)
228	al	$A,0($ctx)
229	al	$B,4($ctx)
230	al	$C,8($ctx)
231	al	$D,12($ctx)
232	al	$E,16($ctx)
233	st	$A,0($ctx)
234	st	$B,4($ctx)
235	st	$C,8($ctx)
236	st	$D,12($ctx)
237	st	$E,16($ctx)
238	brct${g} $len,.Lloop
239
240	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
241	br	%r14
242.size	sha1_block_data_order,.-sha1_block_data_order
243.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
244___
245
246$code =~ s/\`([^\`]*)\`/eval $1/gem;
247
248print $code;
249close STDOUT;
250