1#!/usr/bin/env perl
2
3# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
5# <appro@openssl.org>. The module is licensed under 2-clause BSD
6# license. October 2012. All rights reserved.
7# ====================================================================
8
9######################################################################
10# AES for SPARC T4.
11#
12# AES round instructions complete in 3 cycles and can be issued every
13# cycle. It means that round calculations should take 4*rounds cycles,
14# because any given round instruction depends on result of *both*
15# previous instructions:
16#
17#	|0 |1 |2 |3 |4
18#	|01|01|01|
19#	   |23|23|23|
20#	            |01|01|...
21#	               |23|...
22#
23# Provided that fxor [with IV] takes 3 cycles to complete, critical
24# path length for CBC encrypt would be 3+4*rounds, or in other words
25# it should process one byte in at least (3+4*rounds)/16 cycles. This
26# estimate doesn't account for "collateral" instructions, such as
27# fetching input from memory, xor-ing it with zero-round key and
28# storing the result. Yet, *measured* performance [for data aligned
29# at 64-bit boundary!] deviates from this equation by less than 0.5%:
30#
31#		128-bit key	192-		256-
32# CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
33#			 (*) numbers after slash are for
34#			     misaligned data.
35#
36# Out-of-order execution logic managed to fully overlap "collateral"
37# instructions with those on critical path. Amazing!
38#
39# As with Intel AES-NI, question is if it's possible to improve
40# performance of parallelizeable modes by interleaving round
41# instructions. Provided round instruction latency and throughput
42# optimal interleave factor is 2. But can we expect 2x performance
43# improvement? Well, as round instructions can be issued one per
44# cycle, they don't saturate the 2-way issue pipeline and therefore
45# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattainable:
47#
48#		128-bit key	192-		256-
49# CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
50# CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
51#			 (*) numbers after slash are for
52#			     misaligned data.
53#
54# Estimates based on amount of instructions under assumption that
55# round instructions are not pairable with any other instruction
56# suggest that latter is the actual case and pipeline runs
57# underutilized. It should be noted that T4 out-of-order execution
58# logic is so capable that performance gain from 2x interleave is
59# not even impressive, ~7-13% over non-interleaved code, largest
60# for 256-bit keys.
61
62# To anchor to something else, software implementation processes
63# one byte in 29 cycles with 128-bit key on same processor. Intel
64# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
65# in 0.93, naturally with AES-NI.
66
# Locate this script's directory and make it, together with the shared
# perlasm helpers two levels up, reachable via @INC before pulling in
# the SPARCv9 mode-composition framework.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
push @INC, $dir, "${dir}../../perlasm";
require "sparcv9_modes.pl";

asm_init(@ARGV);

# When $evp is set to 0, the script additionally generates the
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points.  These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at a 64-bit boundary.  When
# used through EVP, alignment is arranged at the EVP layer, which also
# guarantees at least 32-bit alignment of the IV.
$::evp = 1;
79
80######################################################################
81# single-round subroutines
82#
83{
84my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
85
86$code.=<<___ if ($::abibits==64);
87.register	%g2,#scratch
88.register	%g3,#scratch
89
90___
91$code.=<<___;
92.text
93
94.globl	aes_t4_encrypt
95.align	32
96aes_t4_encrypt:
97	andcc		$inp, 7, %g1		! is input aligned?
98	andn		$inp, 7, $inp
99
100	ldx		[$key + 0], %g4
101	ldx		[$key + 8], %g5
102
103	ldx		[$inp + 0], %o4
104	bz,pt		%icc, 1f
105	ldx		[$inp + 8], %o5
106	ldx		[$inp + 16], $inp
107	sll		%g1, 3, %g1
108	sub		%g0, %g1, %o3
109	sllx		%o4, %g1, %o4
110	sllx		%o5, %g1, %g1
111	srlx		%o5, %o3, %o5
112	srlx		$inp, %o3, %o3
113	or		%o5, %o4, %o4
114	or		%o3, %g1, %o5
1151:
116	ld		[$key + 240], $rounds
117	ldd		[$key + 16], %f12
118	ldd		[$key + 24], %f14
119	xor		%g4, %o4, %o4
120	xor		%g5, %o5, %o5
121	movxtod		%o4, %f0
122	movxtod		%o5, %f2
123	srl		$rounds, 1, $rounds
124	ldd		[$key + 32], %f16
125	sub		$rounds, 1, $rounds
126	ldd		[$key + 40], %f18
127	add		$key, 48, $key
128
129.Lenc:
130	aes_eround01	%f12, %f0, %f2, %f4
131	aes_eround23	%f14, %f0, %f2, %f2
132	ldd		[$key + 0], %f12
133	ldd		[$key + 8], %f14
134	sub		$rounds,1,$rounds
135	aes_eround01	%f16, %f4, %f2, %f0
136	aes_eround23	%f18, %f4, %f2, %f2
137	ldd		[$key + 16], %f16
138	ldd		[$key + 24], %f18
139	brnz,pt		$rounds, .Lenc
140	add		$key, 32, $key
141
142	andcc		$out, 7, $tmp		! is output aligned?
143	aes_eround01	%f12, %f0, %f2, %f4
144	aes_eround23	%f14, %f0, %f2, %f2
145	aes_eround01_l	%f16, %f4, %f2, %f0
146	aes_eround23_l	%f18, %f4, %f2, %f2
147
148	bnz,pn		%icc, 2f
149	nop
150
151	std		%f0, [$out + 0]
152	retl
153	std		%f2, [$out + 8]
154
1552:	alignaddrl	$out, %g0, $out
156	mov		0xff, $mask
157	srl		$mask, $tmp, $mask
158
159	faligndata	%f0, %f0, %f4
160	faligndata	%f0, %f2, %f6
161	faligndata	%f2, %f2, %f8
162
163	stda		%f4, [$out + $mask]0xc0	! partial store
164	std		%f6, [$out + 8]
165	add		$out, 16, $out
166	orn		%g0, $mask, $mask
167	retl
168	stda		%f8, [$out + $mask]0xc0	! partial store
169.type	aes_t4_encrypt,#function
170.size	aes_t4_encrypt,.-aes_t4_encrypt
171
172.globl	aes_t4_decrypt
173.align	32
174aes_t4_decrypt:
175	andcc		$inp, 7, %g1		! is input aligned?
176	andn		$inp, 7, $inp
177
178	ldx		[$key + 0], %g4
179	ldx		[$key + 8], %g5
180
181	ldx		[$inp + 0], %o4
182	bz,pt		%icc, 1f
183	ldx		[$inp + 8], %o5
184	ldx		[$inp + 16], $inp
185	sll		%g1, 3, %g1
186	sub		%g0, %g1, %o3
187	sllx		%o4, %g1, %o4
188	sllx		%o5, %g1, %g1
189	srlx		%o5, %o3, %o5
190	srlx		$inp, %o3, %o3
191	or		%o5, %o4, %o4
192	or		%o3, %g1, %o5
1931:
194	ld		[$key + 240], $rounds
195	ldd		[$key + 16], %f12
196	ldd		[$key + 24], %f14
197	xor		%g4, %o4, %o4
198	xor		%g5, %o5, %o5
199	movxtod		%o4, %f0
200	movxtod		%o5, %f2
201	srl		$rounds, 1, $rounds
202	ldd		[$key + 32], %f16
203	sub		$rounds, 1, $rounds
204	ldd		[$key + 40], %f18
205	add		$key, 48, $key
206
207.Ldec:
208	aes_dround01	%f12, %f0, %f2, %f4
209	aes_dround23	%f14, %f0, %f2, %f2
210	ldd		[$key + 0], %f12
211	ldd		[$key + 8], %f14
212	sub		$rounds,1,$rounds
213	aes_dround01	%f16, %f4, %f2, %f0
214	aes_dround23	%f18, %f4, %f2, %f2
215	ldd		[$key + 16], %f16
216	ldd		[$key + 24], %f18
217	brnz,pt		$rounds, .Ldec
218	add		$key, 32, $key
219
220	andcc		$out, 7, $tmp		! is output aligned?
221	aes_dround01	%f12, %f0, %f2, %f4
222	aes_dround23	%f14, %f0, %f2, %f2
223	aes_dround01_l	%f16, %f4, %f2, %f0
224	aes_dround23_l	%f18, %f4, %f2, %f2
225
226	bnz,pn		%icc, 2f
227	nop
228
229	std		%f0, [$out + 0]
230	retl
231	std		%f2, [$out + 8]
232
2332:	alignaddrl	$out, %g0, $out
234	mov		0xff, $mask
235	srl		$mask, $tmp, $mask
236
237	faligndata	%f0, %f0, %f4
238	faligndata	%f0, %f2, %f6
239	faligndata	%f2, %f2, %f8
240
241	stda		%f4, [$out + $mask]0xc0	! partial store
242	std		%f6, [$out + 8]
243	add		$out, 16, $out
244	orn		%g0, $mask, $mask
245	retl
246	stda		%f8, [$out + $mask]0xc0	! partial store
247.type	aes_t4_decrypt,#function
248.size	aes_t4_decrypt,.-aes_t4_decrypt
249___
250}
251
252######################################################################
253# key setup subroutines
254#
255{
256my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
257$code.=<<___;
258.globl	aes_t4_set_encrypt_key
259.align	32
260aes_t4_set_encrypt_key:
261.Lset_encrypt_key:
262	and		$inp, 7, $tmp
263	alignaddr	$inp, %g0, $inp
264	cmp		$bits, 192
265	ldd		[$inp + 0], %f0
266	bl,pt		%icc,.L128
267	ldd		[$inp + 8], %f2
268
269	be,pt		%icc,.L192
270	ldd		[$inp + 16], %f4
271	brz,pt		$tmp, .L256aligned
272	ldd		[$inp + 24], %f6
273
274	ldd		[$inp + 32], %f8
275	faligndata	%f0, %f2, %f0
276	faligndata	%f2, %f4, %f2
277	faligndata	%f4, %f6, %f4
278	faligndata	%f6, %f8, %f6
279.L256aligned:
280___
281for ($i=0; $i<6; $i++) {
282    $code.=<<___;
283	std		%f0, [$out + `32*$i+0`]
284	aes_kexpand1	%f0, %f6, $i, %f0
285	std		%f2, [$out + `32*$i+8`]
286	aes_kexpand2	%f2, %f0, %f2
287	std		%f4, [$out + `32*$i+16`]
288	aes_kexpand0	%f4, %f2, %f4
289	std		%f6, [$out + `32*$i+24`]
290	aes_kexpand2	%f6, %f4, %f6
291___
292}
293$code.=<<___;
294	std		%f0, [$out + `32*$i+0`]
295	aes_kexpand1	%f0, %f6, $i, %f0
296	std		%f2, [$out + `32*$i+8`]
297	aes_kexpand2	%f2, %f0, %f2
298	std		%f4, [$out + `32*$i+16`]
299	std		%f6, [$out + `32*$i+24`]
300	std		%f0, [$out + `32*$i+32`]
301	std		%f2, [$out + `32*$i+40`]
302
303	mov		14, $tmp
304	st		$tmp, [$out + 240]
305	retl
306	xor		%o0, %o0, %o0
307
308.align	16
309.L192:
310	brz,pt		$tmp, .L192aligned
311	nop
312
313	ldd		[$inp + 24], %f6
314	faligndata	%f0, %f2, %f0
315	faligndata	%f2, %f4, %f2
316	faligndata	%f4, %f6, %f4
317.L192aligned:
318___
319for ($i=0; $i<7; $i++) {
320    $code.=<<___;
321	std		%f0, [$out + `24*$i+0`]
322	aes_kexpand1	%f0, %f4, $i, %f0
323	std		%f2, [$out + `24*$i+8`]
324	aes_kexpand2	%f2, %f0, %f2
325	std		%f4, [$out + `24*$i+16`]
326	aes_kexpand2	%f4, %f2, %f4
327___
328}
329$code.=<<___;
330	std		%f0, [$out + `24*$i+0`]
331	aes_kexpand1	%f0, %f4, $i, %f0
332	std		%f2, [$out + `24*$i+8`]
333	aes_kexpand2	%f2, %f0, %f2
334	std		%f4, [$out + `24*$i+16`]
335	std		%f0, [$out + `24*$i+24`]
336	std		%f2, [$out + `24*$i+32`]
337
338	mov		12, $tmp
339	st		$tmp, [$out + 240]
340	retl
341	xor		%o0, %o0, %o0
342
343.align	16
344.L128:
345	brz,pt		$tmp, .L128aligned
346	nop
347
348	ldd		[$inp + 16], %f4
349	faligndata	%f0, %f2, %f0
350	faligndata	%f2, %f4, %f2
351.L128aligned:
352___
353for ($i=0; $i<10; $i++) {
354    $code.=<<___;
355	std		%f0, [$out + `16*$i+0`]
356	aes_kexpand1	%f0, %f2, $i, %f0
357	std		%f2, [$out + `16*$i+8`]
358	aes_kexpand2	%f2, %f0, %f2
359___
360}
361$code.=<<___;
362	std		%f0, [$out + `16*$i+0`]
363	std		%f2, [$out + `16*$i+8`]
364
365	mov		10, $tmp
366	st		$tmp, [$out + 240]
367	retl
368	xor		%o0, %o0, %o0
369.type	aes_t4_set_encrypt_key,#function
370.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
371
372.globl	aes_t4_set_decrypt_key
373.align	32
374aes_t4_set_decrypt_key:
375	mov		%o7, %o5
376	call		.Lset_encrypt_key
377	nop
378
379	mov		%o5, %o7
380	sll		$tmp, 4, $inp		! $tmp is number of rounds
381	add		$tmp, 2, $tmp
382	add		$out, $inp, $inp	! $inp=$out+16*rounds
383	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4
384
385.Lkey_flip:
386	ldd		[$out + 0],  %f0
387	ldd		[$out + 8],  %f2
388	ldd		[$out + 16], %f4
389	ldd		[$out + 24], %f6
390	ldd		[$inp + 0],  %f8
391	ldd		[$inp + 8],  %f10
392	ldd		[$inp - 16], %f12
393	ldd		[$inp - 8],  %f14
394	sub		$tmp, 1, $tmp
395	std		%f0, [$inp + 0]
396	std		%f2, [$inp + 8]
397	std		%f4, [$inp - 16]
398	std		%f6, [$inp - 8]
399	std		%f8, [$out + 0]
400	std		%f10, [$out + 8]
401	std		%f12, [$out + 16]
402	std		%f14, [$out + 24]
403	add		$out, 32, $out
404	brnz		$tmp, .Lkey_flip
405	sub		$inp, 32, $inp
406
407	retl
408	xor		%o0, %o0, %o0
409.type	aes_t4_set_decrypt_key,#function
410.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
411___
412}
413
414{{{
415my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
416my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
417
418$code.=<<___;
419.align	32
420_aes128_encrypt_1x:
421___
422for ($i=0; $i<4; $i++) {
423    $code.=<<___;
424	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
425	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
426	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
427	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
428___
429}
430$code.=<<___;
431	aes_eround01	%f48, %f0, %f2, %f4
432	aes_eround23	%f50, %f0, %f2, %f2
433	aes_eround01_l	%f52, %f4, %f2, %f0
434	retl
435	aes_eround23_l	%f54, %f4, %f2, %f2
436.type	_aes128_encrypt_1x,#function
437.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x
438
439.align	32
440_aes128_encrypt_2x:
441___
442for ($i=0; $i<4; $i++) {
443    $code.=<<___;
444	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
445	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
446	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
447	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
448	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
449	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
450	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
451	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
452___
453}
454$code.=<<___;
455	aes_eround01	%f48, %f0, %f2, %f8
456	aes_eround23	%f50, %f0, %f2, %f2
457	aes_eround01	%f48, %f4, %f6, %f10
458	aes_eround23	%f50, %f4, %f6, %f6
459	aes_eround01_l	%f52, %f8, %f2, %f0
460	aes_eround23_l	%f54, %f8, %f2, %f2
461	aes_eround01_l	%f52, %f10, %f6, %f4
462	retl
463	aes_eround23_l	%f54, %f10, %f6, %f6
464.type	_aes128_encrypt_2x,#function
465.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x
466
467.align	32
468_aes128_loadkey:
469	ldx		[$key + 0], %g4
470	ldx		[$key + 8], %g5
471___
472for ($i=2; $i<22;$i++) {			# load key schedule
473    $code.=<<___;
474	ldd		[$key + `8*$i`], %f`12+2*$i`
475___
476}
477$code.=<<___;
478	retl
479	nop
480.type	_aes128_loadkey,#function
481.size	_aes128_loadkey,.-_aes128_loadkey
482_aes128_load_enckey=_aes128_loadkey
483_aes128_load_deckey=_aes128_loadkey
484
485___
486
487&alg_cbc_encrypt_implement("aes",128);
488if ($::evp) {
489    &alg_ctr32_implement("aes",128);
490    &alg_xts_implement("aes",128,"en");
491    &alg_xts_implement("aes",128,"de");
492}
493&alg_cbc_decrypt_implement("aes",128);
494
495$code.=<<___;
496.align	32
497_aes128_decrypt_1x:
498___
499for ($i=0; $i<4; $i++) {
500    $code.=<<___;
501	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
502	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
503	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
504	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
505___
506}
507$code.=<<___;
508	aes_dround01	%f48, %f0, %f2, %f4
509	aes_dround23	%f50, %f0, %f2, %f2
510	aes_dround01_l	%f52, %f4, %f2, %f0
511	retl
512	aes_dround23_l	%f54, %f4, %f2, %f2
513.type	_aes128_decrypt_1x,#function
514.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x
515
516.align	32
517_aes128_decrypt_2x:
518___
519for ($i=0; $i<4; $i++) {
520    $code.=<<___;
521	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
522	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
523	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
524	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
525	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
526	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
527	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
528	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
529___
530}
531$code.=<<___;
532	aes_dround01	%f48, %f0, %f2, %f8
533	aes_dround23	%f50, %f0, %f2, %f2
534	aes_dround01	%f48, %f4, %f6, %f10
535	aes_dround23	%f50, %f4, %f6, %f6
536	aes_dround01_l	%f52, %f8, %f2, %f0
537	aes_dround23_l	%f54, %f8, %f2, %f2
538	aes_dround01_l	%f52, %f10, %f6, %f4
539	retl
540	aes_dround23_l	%f54, %f10, %f6, %f6
541.type	_aes128_decrypt_2x,#function
542.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
543___
544
545$code.=<<___;
546.align	32
547_aes192_encrypt_1x:
548___
549for ($i=0; $i<5; $i++) {
550    $code.=<<___;
551	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
552	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
553	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
554	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
555___
556}
557$code.=<<___;
558	aes_eround01	%f56, %f0, %f2, %f4
559	aes_eround23	%f58, %f0, %f2, %f2
560	aes_eround01_l	%f60, %f4, %f2, %f0
561	retl
562	aes_eround23_l	%f62, %f4, %f2, %f2
563.type	_aes192_encrypt_1x,#function
564.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x
565
566.align	32
567_aes192_encrypt_2x:
568___
569for ($i=0; $i<5; $i++) {
570    $code.=<<___;
571	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
572	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
573	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
574	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
575	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
576	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
577	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
578	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
579___
580}
581$code.=<<___;
582	aes_eround01	%f56, %f0, %f2, %f8
583	aes_eround23	%f58, %f0, %f2, %f2
584	aes_eround01	%f56, %f4, %f6, %f10
585	aes_eround23	%f58, %f4, %f6, %f6
586	aes_eround01_l	%f60, %f8, %f2, %f0
587	aes_eround23_l	%f62, %f8, %f2, %f2
588	aes_eround01_l	%f60, %f10, %f6, %f4
589	retl
590	aes_eround23_l	%f62, %f10, %f6, %f6
591.type	_aes192_encrypt_2x,#function
592.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x
593
594.align	32
595_aes256_encrypt_1x:
596	aes_eround01	%f16, %f0, %f2, %f4
597	aes_eround23	%f18, %f0, %f2, %f2
598	ldd		[$key + 208], %f16
599	ldd		[$key + 216], %f18
600	aes_eround01	%f20, %f4, %f2, %f0
601	aes_eround23	%f22, %f4, %f2, %f2
602	ldd		[$key + 224], %f20
603	ldd		[$key + 232], %f22
604___
605for ($i=1; $i<6; $i++) {
606    $code.=<<___;
607	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
608	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
609	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
610	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
611___
612}
613$code.=<<___;
614	aes_eround01	%f16, %f0, %f2, %f4
615	aes_eround23	%f18, %f0, %f2, %f2
616	ldd		[$key + 16], %f16
617	ldd		[$key + 24], %f18
618	aes_eround01_l	%f20, %f4, %f2, %f0
619	aes_eround23_l	%f22, %f4, %f2, %f2
620	ldd		[$key + 32], %f20
621	retl
622	ldd		[$key + 40], %f22
623.type	_aes256_encrypt_1x,#function
624.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x
625
626.align	32
627_aes256_encrypt_2x:
628	aes_eround01	%f16, %f0, %f2, %f8
629	aes_eround23	%f18, %f0, %f2, %f2
630	aes_eround01	%f16, %f4, %f6, %f10
631	aes_eround23	%f18, %f4, %f6, %f6
632	ldd		[$key + 208], %f16
633	ldd		[$key + 216], %f18
634	aes_eround01	%f20, %f8, %f2, %f0
635	aes_eround23	%f22, %f8, %f2, %f2
636	aes_eround01	%f20, %f10, %f6, %f4
637	aes_eround23	%f22, %f10, %f6, %f6
638	ldd		[$key + 224], %f20
639	ldd		[$key + 232], %f22
640___
641for ($i=1; $i<6; $i++) {
642    $code.=<<___;
643	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
644	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
645	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
646	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
647	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
648	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
649	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
650	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
651___
652}
653$code.=<<___;
654	aes_eround01	%f16, %f0, %f2, %f8
655	aes_eround23	%f18, %f0, %f2, %f2
656	aes_eround01	%f16, %f4, %f6, %f10
657	aes_eround23	%f18, %f4, %f6, %f6
658	ldd		[$key + 16], %f16
659	ldd		[$key + 24], %f18
660	aes_eround01_l	%f20, %f8, %f2, %f0
661	aes_eround23_l	%f22, %f8, %f2, %f2
662	aes_eround01_l	%f20, %f10, %f6, %f4
663	aes_eround23_l	%f22, %f10, %f6, %f6
664	ldd		[$key + 32], %f20
665	retl
666	ldd		[$key + 40], %f22
667.type	_aes256_encrypt_2x,#function
668.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x
669
670.align	32
671_aes192_loadkey:
672	ldx		[$key + 0], %g4
673	ldx		[$key + 8], %g5
674___
675for ($i=2; $i<26;$i++) {			# load key schedule
676    $code.=<<___;
677	ldd		[$key + `8*$i`], %f`12+2*$i`
678___
679}
680$code.=<<___;
681	retl
682	nop
683.type	_aes192_loadkey,#function
684.size	_aes192_loadkey,.-_aes192_loadkey
685_aes256_loadkey=_aes192_loadkey
686_aes192_load_enckey=_aes192_loadkey
687_aes192_load_deckey=_aes192_loadkey
688_aes256_load_enckey=_aes192_loadkey
689_aes256_load_deckey=_aes192_loadkey
690___
691
692&alg_cbc_encrypt_implement("aes",256);
693&alg_cbc_encrypt_implement("aes",192);
694if ($::evp) {
695    &alg_ctr32_implement("aes",256);
696    &alg_xts_implement("aes",256,"en");
697    &alg_xts_implement("aes",256,"de");
698    &alg_ctr32_implement("aes",192);
699}
700&alg_cbc_decrypt_implement("aes",192);
701&alg_cbc_decrypt_implement("aes",256);
702
703$code.=<<___;
704.align	32
705_aes256_decrypt_1x:
706	aes_dround01	%f16, %f0, %f2, %f4
707	aes_dround23	%f18, %f0, %f2, %f2
708	ldd		[$key + 208], %f16
709	ldd		[$key + 216], %f18
710	aes_dround01	%f20, %f4, %f2, %f0
711	aes_dround23	%f22, %f4, %f2, %f2
712	ldd		[$key + 224], %f20
713	ldd		[$key + 232], %f22
714___
715for ($i=1; $i<6; $i++) {
716    $code.=<<___;
717	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
718	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
719	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
720	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
721___
722}
723$code.=<<___;
724	aes_dround01	%f16, %f0, %f2, %f4
725	aes_dround23	%f18, %f0, %f2, %f2
726	ldd		[$key + 16], %f16
727	ldd		[$key + 24], %f18
728	aes_dround01_l	%f20, %f4, %f2, %f0
729	aes_dround23_l	%f22, %f4, %f2, %f2
730	ldd		[$key + 32], %f20
731	retl
732	ldd		[$key + 40], %f22
733.type	_aes256_decrypt_1x,#function
734.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x
735
736.align	32
737_aes256_decrypt_2x:
738	aes_dround01	%f16, %f0, %f2, %f8
739	aes_dround23	%f18, %f0, %f2, %f2
740	aes_dround01	%f16, %f4, %f6, %f10
741	aes_dround23	%f18, %f4, %f6, %f6
742	ldd		[$key + 208], %f16
743	ldd		[$key + 216], %f18
744	aes_dround01	%f20, %f8, %f2, %f0
745	aes_dround23	%f22, %f8, %f2, %f2
746	aes_dround01	%f20, %f10, %f6, %f4
747	aes_dround23	%f22, %f10, %f6, %f6
748	ldd		[$key + 224], %f20
749	ldd		[$key + 232], %f22
750___
751for ($i=1; $i<6; $i++) {
752    $code.=<<___;
753	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
754	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
755	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
756	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
757	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
758	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
759	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
760	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
761___
762}
763$code.=<<___;
764	aes_dround01	%f16, %f0, %f2, %f8
765	aes_dround23	%f18, %f0, %f2, %f2
766	aes_dround01	%f16, %f4, %f6, %f10
767	aes_dround23	%f18, %f4, %f6, %f6
768	ldd		[$key + 16], %f16
769	ldd		[$key + 24], %f18
770	aes_dround01_l	%f20, %f8, %f2, %f0
771	aes_dround23_l	%f22, %f8, %f2, %f2
772	aes_dround01_l	%f20, %f10, %f6, %f4
773	aes_dround23_l	%f22, %f10, %f6, %f6
774	ldd		[$key + 32], %f20
775	retl
776	ldd		[$key + 40], %f22
777.type	_aes256_decrypt_2x,#function
778.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x
779
780.align	32
781_aes192_decrypt_1x:
782___
783for ($i=0; $i<5; $i++) {
784    $code.=<<___;
785	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
786	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
787	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
788	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
789___
790}
791$code.=<<___;
792	aes_dround01	%f56, %f0, %f2, %f4
793	aes_dround23	%f58, %f0, %f2, %f2
794	aes_dround01_l	%f60, %f4, %f2, %f0
795	retl
796	aes_dround23_l	%f62, %f4, %f2, %f2
797.type	_aes192_decrypt_1x,#function
798.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x
799
800.align	32
801_aes192_decrypt_2x:
802___
803for ($i=0; $i<5; $i++) {
804    $code.=<<___;
805	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
806	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
807	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
808	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
809	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
810	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
811	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
812	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
813___
814}
815$code.=<<___;
816	aes_dround01	%f56, %f0, %f2, %f8
817	aes_dround23	%f58, %f0, %f2, %f2
818	aes_dround01	%f56, %f4, %f6, %f10
819	aes_dround23	%f58, %f4, %f6, %f6
820	aes_dround01_l	%f60, %f8, %f2, %f0
821	aes_dround23_l	%f62, %f8, %f2, %f2
822	aes_dround01_l	%f60, %f10, %f6, %f4
823	retl
824	aes_dround23_l	%f62, %f10, %f6, %f6
825.type	_aes192_decrypt_2x,#function
826.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
827___
828}}}
829
# Non-EVP compatibility entry points (emitted only when $::evp is 0):
# AES_encrypt/AES_decrypt simply alias the t4 routines.  The
# set-key wrappers validate their arguments first -- key-structure
# alignment (%o2 & 7), non-NULL user key (%o0) and key structure
# (%o2), and a bits value (%o1) confined to the 0x1c0 mask and >= 128
# -- before branching into the t4 setters.  Note the annulled (",a")
# branches: the `mov -1/-2, %o0` error returns in the delay slots
# execute only when the corresponding branch is taken.
if (!$::evp) {
$code.=<<___;
.global	AES_encrypt
AES_encrypt=aes_t4_encrypt
.global	AES_decrypt
AES_decrypt=aes_t4_decrypt
.global	AES_set_encrypt_key
.align	32
AES_set_encrypt_key:
	andcc		%o2, 7, %g0		! check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		aes_t4_set_encrypt_key
	nop
1:	retl
	nop
.type	AES_set_encrypt_key,#function
.size	AES_set_encrypt_key,.-AES_set_encrypt_key

.global	AES_set_decrypt_key
.align	32
AES_set_decrypt_key:
	andcc		%o2, 7, %g0		! check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		aes_t4_set_decrypt_key
	nop
1:	retl
	nop
.type	AES_set_decrypt_key,#function
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
___

# Register map for the CBC dispatcher: $inp..$enc = %o0..%o5.
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));

# AES_cbc_encrypt: load key->rounds, pick the encrypt/decrypt chain
# from $enc, then dispatch on the rounds count (cmp %g1, 12 -- its
# condition codes survive the branch to .Lcbc_decrypt) to the
# 128/192/256-bit CBC routines generated earlier.
$code.=<<___;
.globl	AES_cbc_encrypt
.align	32
AES_cbc_encrypt:
	ld		[$key + 240], %g1
	nop
	brz		$enc, .Lcbc_decrypt
	cmp		%g1, 12

	bl,pt		%icc, aes128_t4_cbc_encrypt
	nop
	be,pn		%icc, aes192_t4_cbc_encrypt
	nop
	ba		aes256_t4_cbc_encrypt
	nop

.Lcbc_decrypt:
	bl,pt		%icc, aes128_t4_cbc_decrypt
	nop
	be,pn		%icc, aes192_t4_cbc_decrypt
	nop
	ba		aes256_t4_cbc_decrypt
	nop
.type	AES_cbc_encrypt,#function
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
# Identification string appended to the module.
$code.=<<___;
.asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

# Post-process $code and print it: presumably translates the aes_*,
# movxtod etc. pseudo-instructions into raw .word encodings (defined
# by the perlasm framework pulled in via sparcv9_modes.pl).
&emit_assembler();

close STDOUT;
920