1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# Version 1.1
17#
# The major reason for undertaking this effort was to mitigate the hazard
# of cache-timing attacks. This is [currently and initially!] addressed in
# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B in size each.
# 2. References to them are scheduled for L2 cache latency, meaning
# that the tables don't have to reside in L1 cache. Once again, this
# is an initial draft and one should expect more countermeasures to
# be implemented...
25#
26# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
27# round.
28#
# Even though performance was not the primary goal [on the contrary,
# extra shifts "induced" by the compressed S-box and the longer loop
# epilogue "induced" by scheduling for L2 have a negative effect on
# performance], the code turned out to run in ~23 cycles per processed
# byte en-/decrypted with a 128-bit key. This is a pretty good result for
# code with the mentioned qualities on an UltraSPARC core. Compared to Sun C
# generated code my encrypt procedure runs just a few percent faster,
# while the decrypt one is a whole 50% faster [yes, Sun C failed to
# generate an optimal decrypt procedure]. Compared to GNU C generated code
# both procedures are more than 60% faster:-)
39
# The last command-line argument names the file that receives the generated
# assembly.  Three-argument open keeps the name from being parsed for mode
# characters, and a failed open is now fatal instead of silently throwing
# the output away.  When no name is given the inherited STDOUT is used.
$output = pop;
if (defined $output && length $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
42
# Stack frame parameters.  $frame and $bias are emitted as symbolic names
# (presumably defined by sparc_arch.h -- confirm); $locals is the number of
# extra bytes the round routines reserve to park the return address.
($frame, $bias, $locals) = ("STACK_FRAME", "STACK_BIAS", 16);

# Sixteen accumulators for the table look-ups, one row per output word.
($acc0,  $acc1,  $acc2,  $acc3)  = ("%l0", "%o0", "%o1", "%o2");
($acc4,  $acc5,  $acc6,  $acc7)  = ("%l1", "%o3", "%o4", "%o5");
($acc8,  $acc9,  $acc10, $acc11) = ("%l2", "%o7", "%g1", "%g2");
($acc12, $acc13, $acc14, $acc15) = ("%l3", "%g3", "%g4", "%g5");

# Intermediate state words; they also hold pre-loaded round-key words.
($t0, $t1, $t2, $t3) = ("%l4", "%l5", "%l6", "%l7");

# Cipher state, table base address and key schedule pointer.
($s0, $s1, $s2, $s3) = ("%i0", "%i1", "%i2", "%i3");
($tbl, $key)         = ("%i4", "%i5");

# %i7 is also the return-address register, so the round routines store it
# on the stack before using it as the round counter.
$rounds = "%i7";
79
# Append table data to $code: every argument is emitted as a ".long" pair
# holding the same 32-bit value twice, so each table entry is 8 bytes wide.
# The round code fetches entries with ldx and shifts the doubled word right
# by 8/16/24 bits before xor-ing it into a 32-bit state word.
#
# Arguments: a list of 32-bit integers.  Processing stops at the first
# undefined argument.  No meaningful return value.
#
# The former empty prototype "()" wrongly declared that the sub takes no
# arguments and only worked because callers use the "&" call form, which
# bypasses prototypes.  It is dropped; "&_data_word(...)" still works.
sub _data_word
{
    for my $word (@_) {
	last unless defined $word;
	$code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
    }
}
84
# Start of the generated assembly: include sparc_arch.h, mark %g2/%g3 as
# scratch for 64-bit builds (the round code uses them as $acc11/$acc13),
# select the text section and open the 256-byte aligned AES_Te table.  The
# table contents are appended by the _data_word() call that follows.
$code.=<<___;
#include "sparc_arch.h"

#ifdef  __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif
.section	".text",#alloc,#execinstr

.align	256
AES_Te:
___
# AES_Te look-up words.  256 values are passed and _data_word() emits each
# one twice, so this part of the table occupies 256*8 = 2048 bytes; the
# byte table that follows is therefore addressed as AES_Te+2048.
&_data_word(
	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Remainder of the encryption code, appended to $code as one string:
#
# - 256 single-byte entries that complete AES_Te (at offset 2048); the
#   final round reads them with ldub.
#
# - _sparcv9_AES_encrypt, the internal round routine.  After its "save" it
#   reads the four state words from %i0-%i3 ($s0-$s3), the table address
#   from %i4 ($tbl) and the key schedule from %i5 ($key), i.e. the caller's
#   %o0-%o5.  The round count is loaded from offset 240 of the key and
#   halved, because every .Lenc_loop pass performs two table-lookup rounds
#   and advances $key by 32 bytes.  $rounds is %i7, so the return address
#   is stored on the stack first and reloaded before "ret"; on entry to
#   .Lenc_last $rounds is reused as the pointer to the byte table
#   ($tbl+2048).  The result is left in %i0-%i3, which the caller sees as
#   %o0-%o3 after "restore".
#
# - AES_encrypt, the global entry point.  It takes the input pointer,
#   output pointer and key in %o0, %o1 and %o2.  If either pointer is not
#   4-byte aligned it takes the .Lunaligned_enc path, which builds and
#   stores each 32-bit word one byte at a time, most significant byte
#   first; otherwise it uses word loads and stores.  Both paths compute
#   the address of AES_Te relative to the program counter ("call .+8").
#
# The "fmovs %f0,%f0" filler lines are deleted from $code before printing.
$code.=<<___;
	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
.type	AES_Te,#object
.size	AES_Te,(.-AES_Te)

.align	64
.skip	16
_sparcv9_AES_encrypt:
	save	%sp,-$frame-$locals,%sp
	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
	ld	[$key+240],$rounds
	ld	[$key+0],$t0
	ld	[$key+4],$t1			!
	ld	[$key+8],$t2
	srl	$rounds,1,$rounds
	xor	$t0,$s0,$s0
	ld	[$key+12],$t3
	srl	$s0,21,$acc0
	xor	$t1,$s1,$s1
	ld	[$key+16],$t0
	srl	$s1,13,$acc1			!
	xor	$t2,$s2,$s2
	ld	[$key+20],$t1
	xor	$t3,$s3,$s3
	ld	[$key+24],$t2
	and	$acc0,2040,$acc0
	ld	[$key+28],$t3
	nop
.Lenc_loop:
	srl	$s2,5,$acc2			!
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$s3,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	srl	$s1,21,$acc4
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2		!
	srl	$s2,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$s3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4
	fmovs	%f0,%f0
	sll	$s0,3,$acc7			!
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$s2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	srl	$s3,13,$acc9
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7		!
	srl	$s0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$s1,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9
	fmovs	%f0,%f0
	srl	$s3,21,$acc12			!
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$s0,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	srl	$s1,5,$acc14
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12		!
	sll	$s2,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
	and	$acc15,2040,$acc15
	add	$key,32,$key
	ldx	[$tbl+$acc14],$acc14
	fmovs	%f0,%f0
	subcc	$rounds,1,$rounds		!
	ldx	[$tbl+$acc15],$acc15
	bz,a,pn	%icc,.Lenc_last
	add	$tbl,2048,$rounds

		srlx	$acc1,8,$acc1
		xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
	fmovs	%f0,%f0
		srlx	$acc2,16,$acc2		!
		xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
		srlx	$acc3,24,$acc3
		xor	$acc2,$t0,$t0
	ld	[$key+8],$s2
		srlx	$acc5,8,$acc5
		xor	$acc3,$t0,$t0
	ld	[$key+12],$s3			!
		srlx	$acc6,16,$acc6
		xor	$acc4,$t1,$t1
	fmovs	%f0,%f0
		srlx	$acc7,24,$acc7
		xor	$acc5,$t1,$t1
		srlx	$acc9,8,$acc9
		xor	$acc6,$t1,$t1
		srlx	$acc10,16,$acc10	!
		xor	$acc7,$t1,$t1
		srlx	$acc11,24,$acc11
		xor	$acc8,$t2,$t2
		srlx	$acc13,8,$acc13
		xor	$acc9,$t2,$t2
		srlx	$acc14,16,$acc14
		xor	$acc10,$t2,$t2
		srlx	$acc15,24,$acc15	!
		xor	$acc11,$t2,$t2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$t3,$t3
	srl	$t0,21,$acc0
		xor	$acc14,$t3,$t3
	srl	$t1,13,$acc1
		xor	$acc15,$t3,$t3

	and	$acc0,2040,$acc0		!
	srl	$t2,5,$acc2
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$t3,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	fmovs	%f0,%f0
	srl	$t1,21,$acc4			!
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2
	srl	$t2,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$t3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4		!
	sll	$t0,3,$acc7
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$t2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	fmovs	%f0,%f0
	srl	$t3,13,$acc9			!
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7
	srl	$t0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$t1,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9		!
	srl	$t3,21,$acc12
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$t0,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	fmovs	%f0,%f0
	srl	$t1,5,$acc14			!
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12
	sll	$t2,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
		srlx	$acc1,8,$acc1
	and	$acc15,2040,$acc15
	ldx	[$tbl+$acc14],$acc14		!

		srlx	$acc2,16,$acc2
		xor	$acc0,$s0,$s0
	ldx	[$tbl+$acc15],$acc15
		srlx	$acc3,24,$acc3
		xor	$acc1,$s0,$s0
	ld	[$key+16],$t0
	fmovs	%f0,%f0
		srlx	$acc5,8,$acc5		!
		xor	$acc2,$s0,$s0
	ld	[$key+20],$t1
		srlx	$acc6,16,$acc6
		xor	$acc3,$s0,$s0
	ld	[$key+24],$t2
		srlx	$acc7,24,$acc7
		xor	$acc4,$s1,$s1
	ld	[$key+28],$t3			!
		srlx	$acc9,8,$acc9
		xor	$acc5,$s1,$s1
	ldx	[$tbl+2048+0],%g0		! prefetch te4
		srlx	$acc10,16,$acc10
		xor	$acc6,$s1,$s1
	ldx	[$tbl+2048+32],%g0		! prefetch te4
		srlx	$acc11,24,$acc11
		xor	$acc7,$s1,$s1
	ldx	[$tbl+2048+64],%g0		! prefetch te4
		srlx	$acc13,8,$acc13
		xor	$acc8,$s2,$s2
	ldx	[$tbl+2048+96],%g0		! prefetch te4
		srlx	$acc14,16,$acc14	!
		xor	$acc9,$s2,$s2
	ldx	[$tbl+2048+128],%g0		! prefetch te4
		srlx	$acc15,24,$acc15
		xor	$acc10,$s2,$s2
	ldx	[$tbl+2048+160],%g0		! prefetch te4
	srl	$s0,21,$acc0
		xor	$acc11,$s2,$s2
	ldx	[$tbl+2048+192],%g0		! prefetch te4
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$s3,$s3
	ldx	[$tbl+2048+224],%g0		! prefetch te4
	srl	$s1,13,$acc1			!
		xor	$acc14,$s3,$s3
		xor	$acc15,$s3,$s3
	ba	.Lenc_loop
	and	$acc0,2040,$acc0

.align	32
.Lenc_last:
		srlx	$acc1,8,$acc1		!
		xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
		srlx	$acc2,16,$acc2
		xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
		srlx	$acc3,24,$acc3
		xor	$acc2,$t0,$t0
	ld	[$key+8],$s2			!
		srlx	$acc5,8,$acc5
		xor	$acc3,$t0,$t0
	ld	[$key+12],$s3
		srlx	$acc6,16,$acc6
		xor	$acc4,$t1,$t1
		srlx	$acc7,24,$acc7
		xor	$acc5,$t1,$t1
		srlx	$acc9,8,$acc9		!
		xor	$acc6,$t1,$t1
		srlx	$acc10,16,$acc10
		xor	$acc7,$t1,$t1
		srlx	$acc11,24,$acc11
		xor	$acc8,$t2,$t2
		srlx	$acc13,8,$acc13
		xor	$acc9,$t2,$t2
		srlx	$acc14,16,$acc14	!
		xor	$acc10,$t2,$t2
		srlx	$acc15,24,$acc15
		xor	$acc11,$t2,$t2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$t3,$t3
	srl	$t0,24,$acc0
		xor	$acc14,$t3,$t3
	srl	$t1,16,$acc1			!
		xor	$acc15,$t3,$t3

	srl	$t2,8,$acc2
	and	$acc1,255,$acc1
	ldub	[$rounds+$acc0],$acc0
	srl	$t1,24,$acc4
	and	$acc2,255,$acc2
	ldub	[$rounds+$acc1],$acc1
	srl	$t2,16,$acc5			!
	and	$t3,255,$acc3
	ldub	[$rounds+$acc2],$acc2
	ldub	[$rounds+$acc3],$acc3
	srl	$t3,8,$acc6
	and	$acc5,255,$acc5
	ldub	[$rounds+$acc4],$acc4
	fmovs	%f0,%f0
	srl	$t2,24,$acc8			!
	and	$acc6,255,$acc6
	ldub	[$rounds+$acc5],$acc5
	srl	$t3,16,$acc9
	and	$t0,255,$acc7
	ldub	[$rounds+$acc6],$acc6
	ldub	[$rounds+$acc7],$acc7
	fmovs	%f0,%f0
	srl	$t0,8,$acc10			!
	and	$acc9,255,$acc9
	ldub	[$rounds+$acc8],$acc8
	srl	$t3,24,$acc12
	and	$acc10,255,$acc10
	ldub	[$rounds+$acc9],$acc9
	srl	$t0,16,$acc13
	and	$t1,255,$acc11
	ldub	[$rounds+$acc10],$acc10		!
	srl	$t1,8,$acc14
	and	$acc13,255,$acc13
	ldub	[$rounds+$acc11],$acc11
	ldub	[$rounds+$acc12],$acc12
	and	$acc14,255,$acc14
	ldub	[$rounds+$acc13],$acc13
	and	$t2,255,$acc15
	ldub	[$rounds+$acc14],$acc14		!

		sll	$acc0,24,$acc0
		xor	$acc3,$s0,$s0
	ldub	[$rounds+$acc15],$acc15
		sll	$acc1,16,$acc1
		xor	$acc0,$s0,$s0
	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
	fmovs	%f0,%f0
		sll	$acc2,8,$acc2		!
		xor	$acc1,$s0,$s0
		sll	$acc4,24,$acc4
		xor	$acc2,$s0,$s0
		sll	$acc5,16,$acc5
		xor	$acc7,$s1,$s1
		sll	$acc6,8,$acc6
		xor	$acc4,$s1,$s1
		sll	$acc8,24,$acc8		!
		xor	$acc5,$s1,$s1
		sll	$acc9,16,$acc9
		xor	$acc11,$s2,$s2
		sll	$acc10,8,$acc10
		xor	$acc6,$s1,$s1
		sll	$acc12,24,$acc12
		xor	$acc8,$s2,$s2
		sll	$acc13,16,$acc13	!
		xor	$acc9,$s2,$s2
		sll	$acc14,8,$acc14
		xor	$acc10,$s2,$s2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$s3,$s3
		xor	$acc14,$s3,$s3
		xor	$acc15,$s3,$s3

	ret
	restore
.type	_sparcv9_AES_encrypt,#function
.size	_sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)

.align	32
.globl	AES_encrypt
AES_encrypt:
	or	%o0,%o1,%g1
	andcc	%g1,3,%g0
	bnz,pn	%xcc,.Lunaligned_enc
	save	%sp,-$frame,%sp

	ld	[%i0+0],%o0
	ld	[%i0+4],%o1
	ld	[%i0+8],%o2
	ld	[%i0+12],%o3

1:	call	.+8
	add	%o7,AES_Te-1b,%o4
	call	_sparcv9_AES_encrypt
	mov	%i2,%o5

	st	%o0,[%i1+0]
	st	%o1,[%i1+4]
	st	%o2,[%i1+8]
	st	%o3,[%i1+12]

	ret
	restore

.align	32
.Lunaligned_enc:
	ldub	[%i0+0],%l0
	ldub	[%i0+1],%l1
	ldub	[%i0+2],%l2

	sll	%l0,24,%l0
	ldub	[%i0+3],%l3
	sll	%l1,16,%l1
	ldub	[%i0+4],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+5],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+6],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o0
	ldub	[%i0+7],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	ldub	[%i0+8],%l0
	or	%l7,%l6,%l6
	ldub	[%i0+9],%l1
	or	%l4,%l6,%o1
	ldub	[%i0+10],%l2

	sll	%l0,24,%l0
	ldub	[%i0+11],%l3
	sll	%l1,16,%l1
	ldub	[%i0+12],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+13],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+14],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o2
	ldub	[%i0+15],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	or	%l7,%l6,%l6
	or	%l4,%l6,%o3

1:	call	.+8
	add	%o7,AES_Te-1b,%o4
	call	_sparcv9_AES_encrypt
	mov	%i2,%o5

	srl	%o0,24,%l0
	srl	%o0,16,%l1
	stb	%l0,[%i1+0]
	srl	%o0,8,%l2
	stb	%l1,[%i1+1]
	stb	%l2,[%i1+2]
	srl	%o1,24,%l4
	stb	%o0,[%i1+3]

	srl	%o1,16,%l5
	stb	%l4,[%i1+4]
	srl	%o1,8,%l6
	stb	%l5,[%i1+5]
	stb	%l6,[%i1+6]
	srl	%o2,24,%l0
	stb	%o1,[%i1+7]

	srl	%o2,16,%l1
	stb	%l0,[%i1+8]
	srl	%o2,8,%l2
	stb	%l1,[%i1+9]
	stb	%l2,[%i1+10]
	srl	%o3,24,%l4
	stb	%o2,[%i1+11]

	srl	%o3,16,%l5
	stb	%l4,[%i1+12]
	srl	%o3,8,%l6
	stb	%l5,[%i1+13]
	stb	%l6,[%i1+14]
	stb	%o3,[%i1+15]

	ret
	restore
.type	AES_encrypt,#function
.size	AES_encrypt,(.-AES_encrypt)

___
638
# Open the 256-byte aligned AES_Td table used by the decryption code; its
# contents are appended by the _data_word() call that follows.
$code.=<<___;
.align	256
AES_Td:
___
# AES_Td look-up words.  256 values are passed and _data_word() emits each
# one twice, so this part of the table occupies 256*8 = 2048 bytes; the
# byte table that follows is therefore addressed as AES_Td+2048.
&_data_word(
	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Remainder of the decryption code, appended to $code as one string:
#
# - 256 single-byte entries that complete AES_Td (at offset 2048); the
#   final round reads them with ldub.
#
# - _sparcv9_AES_decrypt, the internal round routine.  After its "save" it
#   reads the four state words from %i0-%i3 ($s0-$s3), the table address
#   from %i4 ($tbl) and the key schedule from %i5 ($key), i.e. the caller's
#   %o0-%o5.  The round count is loaded from offset 240 of the key and
#   halved, because every .Ldec_loop pass performs two table-lookup rounds
#   and advances $key by 32 bytes.  For each output word the four index
#   bytes are taken from the state words in the order s0,s3,s2,s1 (rotated
#   per word).  $rounds is %i7, so the return address is stored on the
#   stack first and reloaded before "ret"; on entry to .Ldec_last $rounds
#   is reused as the pointer to the byte table ($tbl+2048).  The result is
#   left in %i0-%i3, which the caller sees as %o0-%o3 after "restore".
#
# - AES_decrypt, the global entry point.  It takes the input pointer,
#   output pointer and key in %o0, %o1 and %o2.  If either pointer is not
#   4-byte aligned it takes the .Lunaligned_dec path, which builds and
#   stores each 32-bit word one byte at a time, most significant byte
#   first; otherwise it uses word loads and stores.  Both paths compute
#   the address of AES_Td relative to the program counter ("call .+8").
#
# The "fmovs %f0,%f0" filler lines are deleted from $code before printing.
$code.=<<___;
	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.type	AES_Td,#object
.size	AES_Td,(.-AES_Td)

.align	64
.skip	16
_sparcv9_AES_decrypt:
	save	%sp,-$frame-$locals,%sp
	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
	ld	[$key+240],$rounds
	ld	[$key+0],$t0
	ld	[$key+4],$t1			!
	ld	[$key+8],$t2
	ld	[$key+12],$t3
	srl	$rounds,1,$rounds
	xor	$t0,$s0,$s0
	ld	[$key+16],$t0
	xor	$t1,$s1,$s1
	ld	[$key+20],$t1
	srl	$s0,21,$acc0			!
	xor	$t2,$s2,$s2
	ld	[$key+24],$t2
	xor	$t3,$s3,$s3
	and	$acc0,2040,$acc0
	ld	[$key+28],$t3
	srl	$s3,13,$acc1
	nop
.Ldec_loop:
	srl	$s2,5,$acc2			!
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$s1,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	srl	$s1,21,$acc4
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2		!
	srl	$s0,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$s3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4
	fmovs	%f0,%f0
	sll	$s2,3,$acc7			!
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$s2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	srl	$s1,13,$acc9
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7		!
	srl	$s0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$s3,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9
	fmovs	%f0,%f0
	srl	$s3,21,$acc12			!
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$s2,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	srl	$s1,5,$acc14
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12		!
	sll	$s0,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
	and	$acc15,2040,$acc15
	add	$key,32,$key
	ldx	[$tbl+$acc14],$acc14
	fmovs	%f0,%f0
	subcc	$rounds,1,$rounds		!
	ldx	[$tbl+$acc15],$acc15
	bz,a,pn	%icc,.Ldec_last
	add	$tbl,2048,$rounds

		srlx	$acc1,8,$acc1
		xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
	fmovs	%f0,%f0
		srlx	$acc2,16,$acc2		!
		xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
		srlx	$acc3,24,$acc3
		xor	$acc2,$t0,$t0
	ld	[$key+8],$s2
		srlx	$acc5,8,$acc5
		xor	$acc3,$t0,$t0
	ld	[$key+12],$s3			!
		srlx	$acc6,16,$acc6
		xor	$acc4,$t1,$t1
	fmovs	%f0,%f0
		srlx	$acc7,24,$acc7
		xor	$acc5,$t1,$t1
		srlx	$acc9,8,$acc9
		xor	$acc6,$t1,$t1
		srlx	$acc10,16,$acc10	!
		xor	$acc7,$t1,$t1
		srlx	$acc11,24,$acc11
		xor	$acc8,$t2,$t2
		srlx	$acc13,8,$acc13
		xor	$acc9,$t2,$t2
		srlx	$acc14,16,$acc14
		xor	$acc10,$t2,$t2
		srlx	$acc15,24,$acc15	!
		xor	$acc11,$t2,$t2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$t3,$t3
	srl	$t0,21,$acc0
		xor	$acc14,$t3,$t3
		xor	$acc15,$t3,$t3
	srl	$t3,13,$acc1

	and	$acc0,2040,$acc0		!
	srl	$t2,5,$acc2
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$t1,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	fmovs	%f0,%f0
	srl	$t1,21,$acc4			!
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2
	srl	$t0,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$t3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4		!
	sll	$t2,3,$acc7
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$t2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	fmovs	%f0,%f0
	srl	$t1,13,$acc9			!
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7
	srl	$t0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$t3,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9		!
	srl	$t3,21,$acc12
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$t2,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	fmovs	%f0,%f0
	srl	$t1,5,$acc14			!
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12
	sll	$t0,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
		srlx	$acc1,8,$acc1
	and	$acc15,2040,$acc15
	ldx	[$tbl+$acc14],$acc14		!

		srlx	$acc2,16,$acc2
		xor	$acc0,$s0,$s0
	ldx	[$tbl+$acc15],$acc15
		srlx	$acc3,24,$acc3
		xor	$acc1,$s0,$s0
	ld	[$key+16],$t0
	fmovs	%f0,%f0
		srlx	$acc5,8,$acc5		!
		xor	$acc2,$s0,$s0
	ld	[$key+20],$t1
		srlx	$acc6,16,$acc6
		xor	$acc3,$s0,$s0
	ld	[$key+24],$t2
		srlx	$acc7,24,$acc7
		xor	$acc4,$s1,$s1
	ld	[$key+28],$t3			!
		srlx	$acc9,8,$acc9
		xor	$acc5,$s1,$s1
	ldx	[$tbl+2048+0],%g0		! prefetch td4
		srlx	$acc10,16,$acc10
		xor	$acc6,$s1,$s1
	ldx	[$tbl+2048+32],%g0		! prefetch td4
		srlx	$acc11,24,$acc11
		xor	$acc7,$s1,$s1
	ldx	[$tbl+2048+64],%g0		! prefetch td4
		srlx	$acc13,8,$acc13
		xor	$acc8,$s2,$s2
	ldx	[$tbl+2048+96],%g0		! prefetch td4
		srlx	$acc14,16,$acc14	!
		xor	$acc9,$s2,$s2
	ldx	[$tbl+2048+128],%g0		! prefetch td4
		srlx	$acc15,24,$acc15
		xor	$acc10,$s2,$s2
	ldx	[$tbl+2048+160],%g0		! prefetch td4
	srl	$s0,21,$acc0
		xor	$acc11,$s2,$s2
	ldx	[$tbl+2048+192],%g0		! prefetch td4
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$s3,$s3
	ldx	[$tbl+2048+224],%g0		! prefetch td4
	and	$acc0,2040,$acc0		!
		xor	$acc14,$s3,$s3
		xor	$acc15,$s3,$s3
	ba	.Ldec_loop
	srl	$s3,13,$acc1

.align	32
.Ldec_last:
		srlx	$acc1,8,$acc1		!
		xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
		srlx	$acc2,16,$acc2
		xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
		srlx	$acc3,24,$acc3
		xor	$acc2,$t0,$t0
	ld	[$key+8],$s2			!
		srlx	$acc5,8,$acc5
		xor	$acc3,$t0,$t0
	ld	[$key+12],$s3
		srlx	$acc6,16,$acc6
		xor	$acc4,$t1,$t1
		srlx	$acc7,24,$acc7
		xor	$acc5,$t1,$t1
		srlx	$acc9,8,$acc9		!
		xor	$acc6,$t1,$t1
		srlx	$acc10,16,$acc10
		xor	$acc7,$t1,$t1
		srlx	$acc11,24,$acc11
		xor	$acc8,$t2,$t2
		srlx	$acc13,8,$acc13
		xor	$acc9,$t2,$t2
		srlx	$acc14,16,$acc14	!
		xor	$acc10,$t2,$t2
		srlx	$acc15,24,$acc15
		xor	$acc11,$t2,$t2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$t3,$t3
	srl	$t0,24,$acc0
		xor	$acc14,$t3,$t3
		xor	$acc15,$t3,$t3		!
	srl	$t3,16,$acc1

	srl	$t2,8,$acc2
	and	$acc1,255,$acc1
	ldub	[$rounds+$acc0],$acc0
	srl	$t1,24,$acc4
	and	$acc2,255,$acc2
	ldub	[$rounds+$acc1],$acc1
	srl	$t0,16,$acc5			!
	and	$t1,255,$acc3
	ldub	[$rounds+$acc2],$acc2
	ldub	[$rounds+$acc3],$acc3
	srl	$t3,8,$acc6
	and	$acc5,255,$acc5
	ldub	[$rounds+$acc4],$acc4
	fmovs	%f0,%f0
	srl	$t2,24,$acc8			!
	and	$acc6,255,$acc6
	ldub	[$rounds+$acc5],$acc5
	srl	$t1,16,$acc9
	and	$t2,255,$acc7
	ldub	[$rounds+$acc6],$acc6
	ldub	[$rounds+$acc7],$acc7
	fmovs	%f0,%f0
	srl	$t0,8,$acc10			!
	and	$acc9,255,$acc9
	ldub	[$rounds+$acc8],$acc8
	srl	$t3,24,$acc12
	and	$acc10,255,$acc10
	ldub	[$rounds+$acc9],$acc9
	srl	$t2,16,$acc13
	and	$t3,255,$acc11
	ldub	[$rounds+$acc10],$acc10		!
	srl	$t1,8,$acc14
	and	$acc13,255,$acc13
	ldub	[$rounds+$acc11],$acc11
	ldub	[$rounds+$acc12],$acc12
	and	$acc14,255,$acc14
	ldub	[$rounds+$acc13],$acc13
	and	$t0,255,$acc15
	ldub	[$rounds+$acc14],$acc14		!

		sll	$acc0,24,$acc0
		xor	$acc3,$s0,$s0
	ldub	[$rounds+$acc15],$acc15
		sll	$acc1,16,$acc1
		xor	$acc0,$s0,$s0
	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
	fmovs	%f0,%f0
		sll	$acc2,8,$acc2		!
		xor	$acc1,$s0,$s0
		sll	$acc4,24,$acc4
		xor	$acc2,$s0,$s0
		sll	$acc5,16,$acc5
		xor	$acc7,$s1,$s1
		sll	$acc6,8,$acc6
		xor	$acc4,$s1,$s1
		sll	$acc8,24,$acc8		!
		xor	$acc5,$s1,$s1
		sll	$acc9,16,$acc9
		xor	$acc11,$s2,$s2
		sll	$acc10,8,$acc10
		xor	$acc6,$s1,$s1
		sll	$acc12,24,$acc12
		xor	$acc8,$s2,$s2
		sll	$acc13,16,$acc13	!
		xor	$acc9,$s2,$s2
		sll	$acc14,8,$acc14
		xor	$acc10,$s2,$s2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$s3,$s3
		xor	$acc14,$s3,$s3
		xor	$acc15,$s3,$s3

	ret
	restore
.type	_sparcv9_AES_decrypt,#function
.size	_sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)

.align	32
.globl	AES_decrypt
AES_decrypt:
	or	%o0,%o1,%g1
	andcc	%g1,3,%g0
	bnz,pn	%xcc,.Lunaligned_dec
	save	%sp,-$frame,%sp

	ld	[%i0+0],%o0
	ld	[%i0+4],%o1
	ld	[%i0+8],%o2
	ld	[%i0+12],%o3

1:	call	.+8
	add	%o7,AES_Td-1b,%o4
	call	_sparcv9_AES_decrypt
	mov	%i2,%o5

	st	%o0,[%i1+0]
	st	%o1,[%i1+4]
	st	%o2,[%i1+8]
	st	%o3,[%i1+12]

	ret
	restore

.align	32
.Lunaligned_dec:
	ldub	[%i0+0],%l0
	ldub	[%i0+1],%l1
	ldub	[%i0+2],%l2

	sll	%l0,24,%l0
	ldub	[%i0+3],%l3
	sll	%l1,16,%l1
	ldub	[%i0+4],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+5],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+6],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o0
	ldub	[%i0+7],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	ldub	[%i0+8],%l0
	or	%l7,%l6,%l6
	ldub	[%i0+9],%l1
	or	%l4,%l6,%o1
	ldub	[%i0+10],%l2

	sll	%l0,24,%l0
	ldub	[%i0+11],%l3
	sll	%l1,16,%l1
	ldub	[%i0+12],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+13],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+14],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o2
	ldub	[%i0+15],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	or	%l7,%l6,%l6
	or	%l4,%l6,%o3

1:	call	.+8
	add	%o7,AES_Td-1b,%o4
	call	_sparcv9_AES_decrypt
	mov	%i2,%o5

	srl	%o0,24,%l0
	srl	%o0,16,%l1
	stb	%l0,[%i1+0]
	srl	%o0,8,%l2
	stb	%l1,[%i1+1]
	stb	%l2,[%i1+2]
	srl	%o1,24,%l4
	stb	%o0,[%i1+3]

	srl	%o1,16,%l5
	stb	%l4,[%i1+4]
	srl	%o1,8,%l6
	stb	%l5,[%i1+5]
	stb	%l6,[%i1+6]
	srl	%o2,24,%l0
	stb	%o1,[%i1+7]

	srl	%o2,16,%l1
	stb	%l0,[%i1+8]
	srl	%o2,8,%l2
	stb	%l1,[%i1+9]
	stb	%l2,[%i1+10]
	srl	%o3,24,%l4
	stb	%o2,[%i1+11]

	srl	%o3,16,%l5
	stb	%l4,[%i1+12]
	srl	%o3,8,%l6
	stb	%l5,[%i1+13]
	stb	%l6,[%i1+14]
	stb	%o3,[%i1+15]

	ret
	restore
.type	AES_decrypt,#function
.size	AES_decrypt,(.-AES_decrypt)
___
1183
# fmovs instructions substituting for FP nops were originally added
# to meet specific instruction alignment requirements to maximize ILP.
# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
# undesired effect, so just omit them and sacrifice some portion of
# percent in performance...
$code =~ s/fmovs.*$//gm;

# Emit the assembly.  Output is buffered, so a write error (e.g. a full
# disk) may only surface when the handle is closed; check both calls so a
# truncated file is reported instead of being passed on to the assembler.
print STDOUT $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";	# ensure flush
1193