#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# AES for s390x.

# April 2007.
#
# Software performance improvement over gcc-generated code is ~70%, and
# in absolute terms it's ~73 cycles per byte processed with a 128-bit key.
# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
# *strictly* in-order execution, and an issued instruction [in this case
# the load from memory is the critical one] has to complete before
# execution flow proceeds. S-boxes are compressed to 2KB[+256B].
#
# As for hardware acceleration support: it's basically a "teaser," as
# it can and should be improved in several ways. Most notably, support
# for CBC is not utilized, nor are multiple blocks ever processed.
# Also, the software key schedule could be postponed until hardware
# support detection... Performance improvement over assembler is
# reportedly ~2.5x, but can reach >8x [naturally on larger chunks] if
# proper support is implemented.

# May 2007.
#
# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
# for 128-bit keys, if hardware support is detected.

# January 2009.
#
# Add support for hardware AES192/256 and reschedule instructions to
# minimize/avoid Address Generation Interlock hazard and to favour
# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
# almost 50% on z9. The gain is smaller on z10, because being dual-
# issue z10 makes it impossible to eliminate the interlock condition:
# critical path is not long enough. Yet it spends ~24 cycles per byte
# processed with 128-bit key.
#
# Unlike in the previous version, hardware support detection takes place
# only at the moment of key schedule setup, which is denoted in
# key->rounds. This is done because deferred key setup can't be made
# MT-safe, not for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives an incredible performance improvement;
# it was measured to be ~6.6x. It's less than the previously mentioned 8x,
# because the software implementation was optimized.

# May 2010.
#
# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
# performance improvement over the "generic" counter mode routine relying
# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
# to the fact that the exact throughput value depends on the current
# stack frame alignment within the 4KB page. In the worst case you get
# ~75% of the maximum, but *on average* it would be as much as ~98%.
# Meaning that the worst case is unlikely; it's like hitting a ravine
# on a plateau.

# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific. On z990 it was measured
# to perform 2x better than code generated by gcc 4.3.

# December 2010.
#
# Add support for z196 "cipher message with counter" instruction.
# Note however that it's disengaged, because it was measured to
# perform ~12% worse than vanilla km-based code...

# February 2011.
#
# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
# instructions, which deliver ~70% improvement at 8KB block size over
# vanilla km-based code, and ~37% at smaller block sizes such as 512
# bytes.

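# The script takes the perlasm "flavour" followed by the output file
# name. Presumably any flavour matching /3[12]/ (e.g. "linux31")
# selects the 31-bit ABI, anything else (e.g. "linux64") the 64-bit
# one; the exact flavour strings here are illustrative.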
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$softonly=0;	# allow hardware support

$t0="%r0";	$mask="%r0";
$t1="%r1";
$t2="%r2";	$inp="%r2";
$t3="%r3";	$out="%r3";	$bits="%r3";
$key="%r4";
$i1="%r5";
$i2="%r6";
$i3="%r7";
$s0="%r8";
$s1="%r9";
$s2="%r10";
$s3="%r11";
$tbl="%r12";
$rounds="%r13";
$ra="%r14";
$sp="%r15";

$stdframe=16*$SIZE_T+4*8;
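# $stdframe is the standard s390x stack frame size: 16 GPR save slots
# of $SIZE_T bytes each, plus 4 8-byte save slots for the call-clobbered
# FPRs f0/f2/f4/f6, i.e. 160 bytes in 64-bit and 96 bytes in 31-bit
# builds.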

sub _data_word()
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}
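# Each table entry is emitted twice, back to back. With indices
# pre-scaled by 8 (the entry stride), a 4-byte load at byte offset
# 0..3 into the pair returns the entry rotated left by 0/8/16/24 bits,
# so a single 2KB table serves as all four rotated views (Te0/Te3/Te2/
# Te1, likewise Td0/Td3/Td2/Td1) -- the "S-boxes compressed to 2KB"
# mentioned above.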

$code=<<___;
#include "s390x_arch.h"

.text

.type	AES_Te,\@object
.align	256
AES_Te:
___
&_data_word(
	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
$code.=<<___;
# Te4[256]
.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
# rcon[]
.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
.align	256
.size	AES_Te,.-AES_Te

# void AES_encrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_encrypt
.type	AES_encrypt,\@function
AES_encrypt:
___
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lesoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Lesoft:
___
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_encrypt,.-AES_encrypt

.type   _s390x_AES_encrypt,\@function
.align	16
_s390x_AES_encrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Lenc_loop
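	# Note: state bytes are extracted pre-scaled by 8 below (the mask
	# is 0xff<<3 and the shift counts are reduced by 3), so they can
	# be added to the table base directly; the +0/+1/+2/+3
	# displacements then pick the rotated table views laid down by
	# _data_word above.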
.align	16
.Lenc_loop:
	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	ngr	$t1,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	ngr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
	l	$t2,2($t2,$tbl) # Te2[s0>>8]
	l	$t3,3($t3,$tbl)	# Te1[s0>>16]

	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
	x	$t3,2($i3,$tbl)	# Te2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	nr	$s2,$mask
	ngr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	sllg	$t1,$s3,`0+3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	ngr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
	x	$t3,1($i3,$tbl)	# Te3[s2>>0]

	srlg	$i3,$s3,`16-3`	# i2
	xr	$s2,$t2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Lenc_loop
	.align	16

	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	ngr	$t1,$mask
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	ngr	$i2,$mask
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	nr	$i3,$mask

	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
	sll	$s0,24
	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
	sll	$t2,8
	sll	$t3,16

	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
	sll	$i1,16
	sll	$s1,24
	sll	$i3,8
	or	$s0,$i1
	or	$s1,$t1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	ngr	$i3,$mask
	nr	$s2,$mask

	sllg	$t1,$s3,`0+3`	# i0
	srlg	$ra,$s3,`8-3`	# i1
	ngr	$t1,$mask

	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
	sll	$i1,8
	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
	sll	$i2,16
	nr	$ra,$mask
	sll	$s2,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$t3,$i3

	srlg	$i3,$s3,`16-3`	# i2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	l	$t0,16($key)
	l	$t2,20($key)

	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
	sll	$i2,8
	sll	$i3,16
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$i3
	or	$s3,$t3

	l${g}	$ra,15*$SIZE_T($sp)
	xr	$s0,$t0
	xr	$s1,$t2
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
___

$code.=<<___;
.type	AES_Td,\@object
.align	256
AES_Td:
___
&_data_word(
	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
$code.=<<___;
# Td4[256]
.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.size	AES_Td,.-AES_Td

# void AES_decrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_decrypt
.type	AES_decrypt,\@function
AES_decrypt:
___
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Ldsoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Ldsoft:
___
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Td
	bras	$ra,_s390x_AES_decrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_decrypt,.-AES_decrypt

.type   _s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Ldec_loop
.align	16
.Ldec_loop:
	srlg	$t1,$s0,`16-3`
	srlg	$t2,$s0,`8-3`
	sllg	$t3,$s0,`0+3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t1,$mask
	nr	$t2,$mask
	ngr	$t3,$mask

	sllg	$i1,$s1,`0+3`	# i0
	srlg	$i2,$s1,`16-3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	ngr	$i1,$mask
	nr	$s1,$mask
	nr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
	l	$t3,1($t3,$tbl)	# Td3[s0>>0]

	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
	x	$t3,2($i3,$tbl)	# Td2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	sllg	$i2,$s2,`0+3`	# i1
	srlg	$i3,$s2,`16-3`
	srl	$s2,`24-3`
	nr	$i1,$mask
	ngr	$i2,$mask
	nr	$s2,$mask
	nr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	srlg	$t1,$s3,`16-3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	nr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
	x	$t3,3($i3,$tbl)	# Td1[s2>>16]

	sllg	$i3,$s3,`0+3`	# i2
	srl	$s3,`24-3`
	ngr	$i3,$mask
	nr	$s3,$mask

	xr	$s2,$t2
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Ldec_loop
	.align	16

	l	$t1,`2048+0`($tbl)	# prefetch Td4
	l	$t2,`2048+64`($tbl)
	l	$t3,`2048+128`($tbl)
	l	$i1,`2048+192`($tbl)
	llill	$mask,0xff

	srlg	$i3,$s0,24	# i0
	srlg	$t1,$s0,16
	srlg	$t2,$s0,8
	nr	$s0,$mask	# i3
	nr	$t1,$mask

	srlg	$i1,$s1,24
	nr	$t2,$mask
	srlg	$i2,$s1,16
	srlg	$ra,$s1,8
	nr	$s1,$mask	# i0
	nr	$i2,$mask
	nr	$ra,$mask

	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
	sll	$t1,16
	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
	sllg	$s0,$i3,24
	sll	$t2,8

	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
	sll	$i1,24
	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
	sll	$i2,16
	sll	$i3,8
	or	$s0,$s1
	or	$t1,$i1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,8	# i0
	srlg	$i2,$s2,24
	srlg	$i3,$s2,16
	nr	$s2,$mask	# i1
	nr	$i1,$mask
	nr	$i3,$mask
	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
	sll	$i1,8
	sll	$i2,24
	or	$s0,$i1
	sll	$i3,16
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s3,16	# i0
	srlg	$i2,$s3,8	# i1
	srlg	$i3,$s3,24
	nr	$s3,$mask	# i2
	nr	$i1,$mask
	nr	$i2,$mask

	l${g}	$ra,15*$SIZE_T($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)

	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
	sll	$i1,16
	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
	sll	$i2,8
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$s3,$t3

	xr	$s0,$t0
	xr	$s1,$t1
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
___

$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	AES_set_encrypt_key
.type	AES_set_encrypt_key,\@function
.align	16
AES_set_encrypt_key:
_s390x_AES_set_encrypt_key:
	lghi	$t0,0
	cl${g}r	$inp,$t0
	je	.Lminus1
	cl${g}r	$key,$t0
	je	.Lminus1

	lghi	$t0,128
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,192
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,256
	clr	$bits,$t0
	je	.Lproceed
	lghi	%r2,-2
	br	%r14

.align	16
.Lproceed:
___
$code.=<<___ if (!$softonly);
	# convert bits to km(c) code, [128,192,256]->[18,19,20]
	lhi	%r5,-128
	lhi	%r0,18
	ar	%r5,$bits
	srl	%r5,6
	ar	%r5,%r0
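	# i.e. bits=128 -> (128-128)>>6+18 = 18, 192 -> 19, 256 -> 20,
	# matching the km/kmc function codes for AES-128/192/256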

	larl	%r1,OPENSSL_s390xcap_P
	llihh	%r0,0x8000
	srlg	%r0,%r0,0(%r5)
	ng	%r0,S390X_KM(%r1)  # check availability of both km...
	ng	%r0,S390X_KMC(%r1) # ...and kmc support for given key length
	jz	.Lekey_internal

	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
	stmg	%r0,%r1,0($key)
	lhi	%r0,192
	cr	$bits,%r0
	jl	1f
	lg	%r1,16($inp)
	stg	%r1,16($key)
	je	1f
	lg	%r1,24($inp)
	stg	%r1,24($key)
1:	st	$bits,236($key)	# save bits [for debugging purposes]
	lgr	$t0,%r5
	st	%r5,240($key)	# save km(c) code
	lghi	%r2,0
	br	%r14
___
$code.=<<___;
.align	16
.Lekey_internal:
	stm${g}	%r4,%r13,4*$SIZE_T($sp)	# all non-volatile regs and $key

	larl	$tbl,AES_Te+2048

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	st	$s0,0($key)
	st	$s1,4($key)
	st	$s2,8($key)
	st	$s3,12($key)
	lghi	$t0,128
	cr	$bits,$t0
	jne	.Lnot128

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,10
	st	$rounds,240($key)

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t2,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L128_loop:
	la	$t2,0($t2,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
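	# icm inserts one table byte under the given mask; the four icm
	# instructions below assemble SubWord(RotWord(rk[3])) from Te4
	# lookups, which is then XORed with rcon[i] per the AES key
	# schedule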
	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
	x	$t2,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t2			# rk[4]=rk[0]^...
	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	nr	$t2,$mask
	nr	$i1,$mask
	srlg	$i3,$s3,24
	nr	$i2,$mask

	st	$s0,16($key)
	st	$s1,20($key)
	st	$s2,24($key)
	st	$s3,28($key)
	la	$key,16($key)		# key+=4
	la	$t3,4($t3)		# i++
	brct	$rounds,.L128_loop
	lghi	$t0,10
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.Lnot128:
	llgf	$t0,16($inp)
	llgf	$t1,20($inp)
	st	$t0,16($key)
	st	$t1,20($key)
	lghi	$t0,192
	cr	$bits,$t0
	jne	.Lnot192

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,12
	st	$rounds,240($key)
	lghi	$rounds,8

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L192_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[6]=rk[0]^...
	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]

	st	$s0,24($key)
	st	$s1,28($key)
	st	$s2,32($key)
	st	$s3,36($key)
	brct	$rounds,.L192_continue
	lghi	$t0,12
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L192_continue:
	lgr	$t1,$s3
	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
	st	$t1,40($key)
	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
	st	$t1,44($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,24($key)		# key+=6
	la	$t3,4($t3)		# i++
	j	.L192_loop

.align	16
.Lnot192:
	llgf	$t0,24($inp)
	llgf	$t1,28($inp)
	st	$t0,24($key)
	st	$t1,28($key)
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,14
	st	$rounds,240($key)
	lghi	$rounds,7

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L256_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[8]=rk[0]^...
	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
	st	$s0,32($key)
	st	$s1,36($key)
	st	$s2,40($key)
	st	$s3,44($key)
	brct	$rounds,.L256_continue
	lghi	$t0,14
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L256_continue:
	lgr	$t1,$s3			# temp=rk[11]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
	x	$t1,16($key)		# rk[12]=rk[4]^...
	st	$t1,48($key)
	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
	st	$t1,52($key)
	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
	st	$t1,56($key)
	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
	st	$t1,60($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,32($key)		# key+=8
	la	$t3,4($t3)		# i++
	j	.L256_loop

.Lminus1:
	lghi	%r2,-1
	br	$ra
.size	AES_set_encrypt_key,.-AES_set_encrypt_key

# void AES_set_decrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	AES_set_decrypt_key
.type	AES_set_decrypt_key,\@function
.align	16
AES_set_decrypt_key:
	#st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers and $key!
	bras	$ra,_s390x_AES_set_encrypt_key
	#l${g}	$key,4*$SIZE_T($sp)
	l${g}	$ra,14*$SIZE_T($sp)
	ltgr	%r2,%r2
	bnzr	$ra
___
$code.=<<___ if (!$softonly);
	#l	$t0,240($key)
	lhi	$t1,16
	cr	$t0,$t1
	jl	.Lgo
	oill	$t0,S390X_DECRYPT	# set "decrypt" bit
	st	$t0,240($key)
	br	$ra
___
$code.=<<___;
.align	16
.Lgo:	lgr	$rounds,$t0	#llgf	$rounds,240($key)
	la	$i1,0($key)
	sllg	$i2,$rounds,4
	la	$i2,0($i2,$key)
	srl	$rounds,1
	lghi	$t1,-16

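	# swap 16-byte round keys from both ends of the schedule so the
	# decryption rounds can read them in natural order; the .Lmix
	# loop below then applies InvMixColumns to the inner round keys
	# to complete the equivalent-inverse-cipher key schedule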
.align	16
.Linv:	lmg	$s0,$s1,0($i1)
	lmg	$s2,$s3,0($i2)
	stmg	$s0,$s1,0($i2)
	stmg	$s2,$s3,0($i1)
	la	$i1,16($i1)
	la	$i2,0($t1,$i2)
	brct	$rounds,.Linv
___
$mask80=$i1;
$mask1b=$i2;
$maskfe=$i3;
$code.=<<___;
	llgf	$rounds,240($key)
	aghi	$rounds,-1
	sll	$rounds,2	# (rounds-1)*4
	llilh	$mask80,0x8080
	llilh	$mask1b,0x1b1b
	llilh	$maskfe,0xfefe
	oill	$mask80,0x8080
	oill	$mask1b,0x1b1b
	oill	$maskfe,0xfefe
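
	# The .Lmix loop below computes xtime() on four key-schedule
	# bytes in parallel: the 0x80 bits select a per-byte 0x1b
	# reduction constant (x - (x>>7) turns 0x80 into 0x7f, and
	# &0x1b leaves 0x1b), which is XORed with the byte-wise left
	# shift masked to 0xfe. Chaining it yields tp2/tp4/tp8, and the
	# rotates combine them into InvMixColumns of each round-key word.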

.align	16
.Lmix:	l	$s0,16($key)	# tp1
	lr	$s1,$s0
	ngr	$s1,$mask80
	srlg	$t1,$s1,7
	slr	$s1,$t1
	nr	$s1,$mask1b
	sllg	$t1,$s0,1
	nr	$t1,$maskfe
	xr	$s1,$t1		# tp2

	lr	$s2,$s1
	ngr	$s2,$mask80
	srlg	$t1,$s2,7
	slr	$s2,$t1
	nr	$s2,$mask1b
	sllg	$t1,$s1,1
	nr	$t1,$maskfe
	xr	$s2,$t1		# tp4

	lr	$s3,$s2
	ngr	$s3,$mask80
	srlg	$t1,$s3,7
	slr	$s3,$t1
	nr	$s3,$mask1b
	sllg	$t1,$s2,1
	nr	$t1,$maskfe
	xr	$s3,$t1		# tp8

	xr	$s1,$s0		# tp2^tp1
	xr	$s2,$s0		# tp4^tp1
	rll	$s0,$s0,24	# = ROTATE(tp1,8)
	xr	$s2,$s3		# ^=tp8
	xr	$s0,$s1		# ^=tp2^tp1
	xr	$s1,$s3		# tp2^tp1^tp8
	xr	$s0,$s2		# ^=tp4^tp1^tp8
	rll	$s1,$s1,8
	rll	$s2,$s2,16
	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
	rll	$s3,$s3,24
	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
	xr	$s0,$s3		# ^= ROTATE(tp8,8)

	st	$s0,16($key)
	la	$key,4($key)
	brct	$rounds,.Lmix

	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
	lghi	%r2,0
	br	$ra
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
___

########################################################################
# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                     size_t length, const AES_KEY *key,
#                     unsigned char *ivec, const int enc)
{
my $inp="%r2";
my $out="%r4";	# length and out are swapped
my $len="%r3";
my $key="%r5";
my $ivp="%r6";

$code.=<<___;
.globl	AES_cbc_encrypt
.type	AES_cbc_encrypt,\@function
.align	16
AES_cbc_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, out and len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
$code.=<<___ if (!$softonly);
	lhi	%r0,16
	cl	%r0,240($key)
	jh	.Lcbc_software

	lg	%r0,0($ivp)	# copy ivec
	lg	%r1,8($ivp)
	stmg	%r0,%r1,16($sp)
	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
	stmg	%r0,%r1,32($sp)
	lmg	%r0,%r1,16($key)
	stmg	%r0,%r1,48($sp)
	l	%r0,240($key)	# load kmc code
	lghi	$key,15		# res=len%16, len-=res;
	ngr	$key,$len
	sl${g}r	$len,$key
	la	%r1,16($sp)	# parameter block - ivec || key
	jz	.Lkmc_truncated
	.long	0xb92f0042	# kmc %r4,%r2
	brc	1,.-4		# pay attention to "partial completion"
	ltr	$key,$key
	jnz	.Lkmc_truncated
.Lkmc_done:
	lmg	%r0,%r1,16($sp)	# copy ivec to caller
	stg	%r0,0($ivp)
	stg	%r1,8($ivp)
	br	$ra
.align	16
.Lkmc_truncated:
	ahi	$key,-1		# it's the way it's encoded in mvc
	tmll	%r0,S390X_DECRYPT
	jnz	.Lkmc_truncated_dec
	lghi	%r1,0
	stg	%r1,16*$SIZE_T($sp)
	stg	%r1,16*$SIZE_T+8($sp)
	bras	%r1,1f
	mvc	16*$SIZE_T(1,$sp),0($inp)
1:	ex	$key,0(%r1)
	la	%r1,16($sp)	# restore parameter block
	la	$inp,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	j	.Lkmc_done
.align	16
.Lkmc_truncated_dec:
	st${g}	$out,4*$SIZE_T($sp)
	la	$out,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	l${g}	$out,4*$SIZE_T($sp)
	bras	%r1,2f
	mvc	0(1,$out),16*$SIZE_T($sp)
2:	ex	$key,0(%r1)
	j	.Lkmc_done
.align	16
.Lcbc_software:
___
$code.=<<___;
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	lhi	%r0,0
	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
	je	.Lcbc_decrypt

	larl	$tbl,AES_Te

	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
	llgf	$s3,12($ivp)

	lghi	$t0,16
	sl${g}r	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
.Lcbc_enc_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	x	$s0,0($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_encrypt

	lm${g}	$inp,$key,2*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	la	$inp,16($inp)
	la	$out,16($out)
	lghi	$t0,16
	lt${g}r	$len,$len
	jz	.Lcbc_enc_done
	sl${g}r	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
	j	.Lcbc_enc_loop
.align	16
.Lcbc_enc_done:
	l${g}	$ivp,6*$SIZE_T($sp)
	st	$s0,0($ivp)
	st	$s1,4($ivp)
	st	$s2,8($ivp)
	st	$s3,12($ivp)

	lm${g}	%r7,$ra,7*$SIZE_T($sp)
	br	$ra

.align	16
.Lcbc_enc_tail:
	aghi	$len,15
	lghi	$t0,0
	stg	$t0,16*$SIZE_T($sp)
	stg	$t0,16*$SIZE_T+8($sp)
	bras	$t1,3f
	mvc	16*$SIZE_T(1,$sp),0($inp)
3:	ex	$len,0($t1)
	lghi	$len,0
	la	$inp,16*$SIZE_T($sp)
	j	.Lcbc_enc_loop

.align	16
.Lcbc_decrypt:
	larl	$tbl,AES_Td

	lg	$t0,0($ivp)
	lg	$t1,8($ivp)
	stmg	$t0,$t1,16*$SIZE_T($sp)

.Lcbc_dec_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_decrypt

	lm${g}	$inp,$key,2*$SIZE_T($sp)
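	# merge the four 32-bit state words into two 64-bit registers
	# and XOR with the previous ciphertext block (the chaining value
	# saved on the stack) to complete CBC decryption of this block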
	sllg	$s0,$s0,32
	sllg	$s2,$s2,32
	lr	$s0,$s1
	lr	$s2,$s3

	lg	$t0,0($inp)
	lg	$t1,8($inp)
	xg	$s0,16*$SIZE_T($sp)
	xg	$s2,16*$SIZE_T+8($sp)
	lghi	$s1,16
	sl${g}r	$len,$s1
	brc	4,.Lcbc_dec_tail	# if borrow
	brc	2,.Lcbc_dec_done	# if zero
	stg	$s0,0($out)
	stg	$s2,8($out)
	stmg	$t0,$t1,16*$SIZE_T($sp)

	la	$inp,16($inp)
	la	$out,16($out)
	j	.Lcbc_dec_loop

.Lcbc_dec_done:
	stg	$s0,0($out)
	stg	$s2,8($out)
.Lcbc_dec_exit:
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	stmg	$t0,$t1,0($ivp)

	br	$ra

.align	16
.Lcbc_dec_tail:
	aghi	$len,15
	stg	$s0,16*$SIZE_T($sp)
	stg	$s2,16*$SIZE_T+8($sp)
	bras	$s1,4f
	mvc	0(1,$out),16*$SIZE_T($sp)
4:	ex	$len,0($s1)
	j	.Lcbc_dec_exit
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
########################################################################
# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
#                     size_t blocks, const AES_KEY *key,
#                     const unsigned char *ivec)
{
my $inp="%r2";
my $out="%r4";	# blocks and out are swapped
my $len="%r3";
my $key="%r5";	my $iv0="%r5";
my $ivp="%r6";
my $fp ="%r7";

$code.=<<___;
.globl	AES_ctr32_encrypt
.type	AES_ctr32_encrypt,\@function
.align	16
AES_ctr32_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
___
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lctr32_software

	st${g}	$s2,10*$SIZE_T($sp)
	st${g}	$s3,11*$SIZE_T($sp)

	clr	$len,%r1		# does work even in 64-bit mode
	jle	.Lctr32_nokma		# kma is slower for <= 16 blocks

	larl	%r1,OPENSSL_s390xcap_P
	lr	$s2,%r0
	llihh	$s3,0x8000
	srlg	$s3,$s3,0($s2)
	ng	$s3,S390X_KMA(%r1)		# check kma capability vector
	jz	.Lctr32_nokma

	l${g}hi	%r1,-$stdframe-112
	l${g}r	$s3,$sp
	la	$sp,0(%r1,$sp)			# prepare parameter block

	lhi	%r1,0x0600
	sllg	$len,$len,4
	or	%r0,%r1				# set HS and LAAD flags

	st${g}	$s3,0($sp)			# backchain
	la	%r1,$stdframe($sp)

	lmg	$s2,$s3,0($key)			# copy key
	stg	$s2,$stdframe+80($sp)
	stg	$s3,$stdframe+88($sp)
	lmg	$s2,$s3,16($key)
	stg	$s2,$stdframe+96($sp)
	stg	$s3,$stdframe+104($sp)

	lmg	$s2,$s3,0($ivp)			# copy iv
	stg	$s2,$stdframe+64($sp)
	ahi	$s3,-1				# kma requires counter-1
	stg	$s3,$stdframe+72($sp)
	st	$s3,$stdframe+12($sp)		# copy counter

	lghi	$s2,0				# no AAD
	lghi	$s3,0

	.long	0xb929a042	# kma $out,$s2,$inp
	brc	1,.-4		# pay attention to "partial completion"

	stg	%r0,$stdframe+80($sp)		# wipe key
	stg	%r0,$stdframe+88($sp)
	stg	%r0,$stdframe+96($sp)
	stg	%r0,$stdframe+104($sp)
	la	$sp,$stdframe+112($sp)

	lm${g}	$s2,$s3,10*$SIZE_T($sp)
	br	$ra

.align	16
.Lctr32_nokma:
	stm${g}	%r6,$s1,6*$SIZE_T($sp)

	slgr	$out,$inp
	la	%r1,0($key)	# %r1 is permanent copy of $key
	lg	$iv0,0($ivp)	# load ivec
	lg	$ivp,8($ivp)

	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1		# align at page boundary
	slgr	$fp,$s0		# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1		# deduct reservation to get usable buffer size
	# buffer size is at least 256 and at most 3072+256-16

	la	$sp,1024($s0)	# alloca
	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
	st${g}	$s2,0($sp)	# back-chain
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lctr32_hw_switch	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lctr32_hw_switch:
___
$code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
	llgfr	$s0,%r0
	lgr	$s1,%r1
	larl	%r1,OPENSSL_s390xcap_P
	llihh	%r0,0x8000	# check if kmctr supports the function code
	srlg	%r0,%r0,0($s0)
	ng	%r0,S390X_KMCTR(%r1)	# check kmctr capability vector
	lgr	%r0,$s0
	lgr	%r1,$s1
	jz	.Lctr32_km_loop

####### kmctr code
	algr	$out,$inp	# restore $out
	lgr	$s1,$len	# $s1 undertakes $len
	j	.Lctr32_kmctr_loop
.align	16
.Lctr32_kmctr_loop:
	la	$s2,16($sp)
	lgr	$s3,$fp
.Lctr32_kmctr_prepare:
	stg	$iv0,0($s2)
	stg	$ivp,8($s2)
	la	$s2,16($s2)
	ahi	$ivp,1		# 32-bit increment, preserves upper half
	brct	$s3,.Lctr32_kmctr_prepare

	#la	$inp,0($inp)	# inp
	sllg	$len,$fp,4	# len
	#la	$out,0($out)	# out
	la	$s2,16($sp)	# iv
	.long	0xb92da042	# kmctr $out,$s2,$inp
	brc	1,.-4		# pay attention to "partial completion"

	slgr	$s1,$fp
	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
	algr	$fp,$s1
	lghi	$s1,0
	brc	4+1,.Lctr32_kmctr_loop	# not zero

	l${g}	$sp,0($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
___
$code.=<<___ if (!$softonly);
.Lctr32_km_loop:
	la	$s2,16($sp)
	lgr	$s3,$fp
.Lctr32_km_prepare:
	stg	$iv0,0($s2)
	stg	$ivp,8($s2)
	la	$s2,16($s2)
	ahi	$ivp,1		# 32-bit increment, preserves upper half
	brct	$s3,.Lctr32_km_prepare

	la	$s0,16($sp)	# inp
	sllg	$s1,$fp,4	# len
	la	$s2,16($sp)	# out
	.long	0xb92e00a8	# km %r10,%r8
	brc	1,.-4		# pay attention to "partial completion"

	la	$s2,16($sp)
	lgr	$s3,$fp
	slgr	$s2,$inp
.Lctr32_km_xor:
	lg	$s0,0($inp)
	lg	$s1,8($inp)
	xg	$s0,0($s2,$inp)
	xg	$s1,8($s2,$inp)
	stg	$s0,0($out,$inp)
	stg	$s1,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lctr32_km_xor

	slgr	$len,$fp
	brc	1,.Lctr32_km_loop	# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lctr32_km_loop	# not zero

	l${g}	$s0,0($sp)
	l${g}	$s1,$SIZE_T($sp)
	la	$s2,16($sp)
.Lctr32_km_zap:
	stg	$s0,0($s2)
	stg	$s0,8($s2)
	la	$s2,16($s2)
	brct	$s1,.Lctr32_km_zap

	la	$sp,0($s0)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lctr32_software:
___
$code.=<<___;
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	sl${g}r	$inp,$out
	larl	$tbl,AES_Te
	llgf	$t1,12($ivp)

.Lctr32_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
	lgr	$s3,$t1
	st	$t1,16*$SIZE_T($sp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_encrypt

	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
	llgf	$t1,16*$SIZE_T($sp)
	x	$s0,0($inp,$out)
	x	$s1,4($inp,$out)
	x	$s2,8($inp,$out)
	x	$s3,12($inp,$out)
	stm	$s0,$s3,0($out)

	la	$out,16($out)
	ahi	$t1,1		# 32-bit increment
	brct	$len,.Lctr32_loop

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
___
}

########################################################################
# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
#	size_t len, const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
{
my $inp="%r2";
my $out="%r4";	# len and out are swapped
my $len="%r3";
my $key1="%r5";	# $i1
my $key2="%r6";	# $i2
my $fp="%r7";	# $i3
my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...

$code.=<<___;
.type	_s390x_xts_km,\@function
.align	16
_s390x_xts_km:
___
$code.=<<___ if(1);
	llgfr	$s0,%r0			# put aside the function code
	lghi	$s1,0x7f
	nr	$s1,%r0
	larl	%r1,OPENSSL_s390xcap_P
	llihh	%r0,0x8000
	srlg	%r0,%r0,32($s1)		# check for 32+function code
	ng	%r0,S390X_KM(%r1)	# check km capability vector
	lgr	%r0,$s0			# restore the function code
	la	%r1,0($key1)		# restore $key1
	jz	.Lxts_km_vanilla

	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
	algr	$out,$inp

	oill	%r0,32			# switch to xts function code
	aghi	$s1,-18			#
	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
	la	%r1,$tweak-16($sp)
	slgr	%r1,$s1			# parameter block position
	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
					# yes, it contains junk and overlaps
					# with the tweak in 128-bit case.
					# it's done to avoid conditional
					# branch.
	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value

	.long	0xb92e0042		# km %r4,%r2
	brc	1,.-4			# pay attention to "partial completion"

	lrvg	$s0,$tweak+0($sp)	# load the last tweak
	lrvg	$s1,$tweak+8($sp)
	stmg	%r0,%r3,$tweak-32($sp)	# wipe copy of the key

	nill	%r0,0xffdf		# switch back to original function code
	la	%r1,0($key1)		# restore pointer to $key1
	slgr	$out,$inp

	llgc	$len,2*$SIZE_T-1($sp)
	nill	$len,0x0f		# $len%=16
	br	$ra

.align	16
.Lxts_km_vanilla:
___
$code.=<<___;
	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1		# align at page boundary
	slgr	$fp,$s0		# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1		# deduct reservation to get usable buffer size
	# buffer size is at least 256 and at most 3072+256-16

	la	$sp,1024($s0)	# alloca
	nill	$fp,0xfff0	# round to 16*n
	st${g}	$s2,0($sp)	# back-chain
	nill	$len,0xfff0	# redundant
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lxts_km_go	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lxts_km_go:
	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
	lrvg	$s1,$tweak+8($s2)

	la	$s2,16($sp)		# vector of ascending tweak values
	slgr	$s2,$inp
	srlg	$s3,$fp,4
	j	.Lxts_km_start

.Lxts_km_loop:
	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
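	# the loop below multiplies the 128-bit tweak (held in s0:s1 as
	# a little-endian number) by x in GF(2^128): srag broadcasts the
	# top bit, the add-with-carry pair shifts the 128-bit value left
	# by one, and the conditional XOR with 0x87 reduces modulo
	# x^128+x^7+x^2+x+1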
.Lxts_km_prepare:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
.Lxts_km_start:
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	stg	$i1,0($s2,$inp)
	stg	$i2,8($s2,$inp)
	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_prepare

	slgr	$inp,$fp		# rewind $inp
	la	$s2,0($out,$inp)
	lgr	$s3,$fp
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# pay attention to "partial completion"

	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_xor:
	lg	$i1,0($out,$inp)
	lg	$i2,8($out,$inp)
	xg	$i1,0($s2,$inp)
	xg	$i2,8($s2,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_xor

	slgr	$len,$fp
	brc	1,.Lxts_km_loop		# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lxts_km_loop	# not zero

	l${g}	$i1,0($sp)		# back-chain
	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
	la	$i2,16($sp)
	srlg	$fp,$fp,4
.Lxts_km_zap:
	stg	$i1,0($i2)
	stg	$i1,8($i2)
	la	$i2,16($i2)
	brct	$fp,.Lxts_km_zap

	la	$sp,0($i1)
	llgc	$len,2*$SIZE_T-1($i1)
	nill	$len,0x0f		# $len%=16
	bzr	$ra

	# generate one more tweak...
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1

	ltr	$len,$len		# clear zero flag
	br	$ra
.size	_s390x_xts_km,.-_s390x_xts_km

.globl	AES_xts_encrypt
.type	AES_xts_encrypt,\@function
.align	16
AES_xts_encrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	srag	$len,$len,4		# formally wrong, because it expands
					# sign byte, but who can afford asking
					# to process more than 2^63-1 bytes?
					# I use it, because it sets condition
					# code...
	bcr	8,$ra			# abort if zero (i.e. less than 16)
___
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_enc_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	sllg	$len,$len,4		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed anymore
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore
	bras	$ra,_s390x_xts_km
	jz	.Lxts_enc_km_done

	aghi	$inp,-16		# take one step back
	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_enc_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_km_steal

	la	$s2,0($i3)
	lghi	$s3,16
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($s2)
	xg	$i2,8($s2)
	stg	$i1,0($s2)
	stg	$i2,8($s2)
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($i3)
	xg	$i2,8($i3)
	stg	$i1,0($i3)
	stg	$i2,8($i3)

.Lxts_enc_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_enc_software:
___
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	j	.Lxts_enc_enter

.align	16
.Lxts_enc_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
	la	$inp,16($inp)		# $inp+=16
.Lxts_enc_enter:
	x	$s0,0($inp)		# ^=*($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	brct${g}	$len,.Lxts_enc_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_enc_done

	la	$i3,0($inp,$out)	# put aside real $out
.Lxts_enc_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_steal
	la	$out,0($i3)		# restore real $out

	# generate last tweak...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,`$tweak+0`($sp)	# ^=tweak
	x	$s1,`$tweak+4`($sp)
	x	$s2,`$tweak+8`($sp)
	x	$s3,`$tweak+12`($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

.Lxts_enc_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_encrypt,.-AES_xts_encrypt
___
# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
#	size_t len, const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
$code.=<<___;
.globl	AES_xts_decrypt
.type	AES_xts_decrypt,\@function
.align	16
AES_xts_decrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	aghi	$len,-16
	bcr	4,$ra			# abort if less than zero. formally
					# wrong, because $len is unsigned,
					# but who can afford asking to
					# process more than 2^63-1 bytes?
	tmll	$len,0x0f
	jnz	.Lxts_dec_proceed
	aghi	$len,16
.Lxts_dec_proceed:
___
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_dec_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	nill	$len,0xfff0		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore

	ltgr	$len,$len
	jz	.Lxts_dec_km_short
	bras	$ra,_s390x_xts_km
	jz	.Lxts_dec_km_done

	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1
	j	.Lxts_dec_km_2ndtweak

.Lxts_dec_km_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%=16
	lrvg	$s0,$tweak+0($sp)	# load the tweak
	lrvg	$s1,$tweak+8($sp)
	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1

.Lxts_dec_km_2ndtweak:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1

	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$i2,0($out,$inp)
	lghi	$i3,16
	.long	0xb92e0066		# km $i2,$i2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0
	lrvgr	$i2,$s1
	xg	$i1,0($out,$inp)
	xg	$i2,8($out,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_km_steal

	lgr	$s0,$s2
	lgr	$s1,$s3
	xg	$s0,0($i3)
	xg	$s1,8($i3)
	stg	$s0,0($i3)
	stg	$s1,8($i3)
	la	$s0,0($i3)
	lghi	$s1,16
	.long	0xb92e0088		# km $s0,$s0
	brc	1,.-4			# can this happen?
	xg	$s2,0($i3)
	xg	$s3,8($i3)
	stg	$s2,0($i3)
	stg	$s3,8($i3)
.Lxts_dec_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_dec_software:
___
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	srlg	$len,$len,4
	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	larl	$tbl,AES_Td
	lt${g}r	$len,$len
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	jz	.Lxts_dec_short
	j	.Lxts_dec_enter

.align	16
.Lxts_dec_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
.Lxts_dec_enter:
	x	$s0,0($inp)		# tweak^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	la	$inp,16($inp)
	brct${g}	$len,.Lxts_dec_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_dec_done

	# generate pair of tweaks...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$i2,$s1			# flip byte order
	lrvgr	$i3,$s3
	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
	j	.Lxts_dec_2ndtweak

.align	16
.Lxts_dec_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
.Lxts_dec_2ndtweak:
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak-16+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
	x	$s1,$tweak-16+4($sp)
	x	$s2,$tweak-16+8($sp)
	x	$s3,$tweak-16+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_steal
	la	$out,0($i3)		# restore real $out

	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)
	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
	stg	$sp,$tweak-16+8($sp)
.Lxts_dec_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
___
}
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";	# force flush
