#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2016
#
# Initial support for Fujitsu SPARC64 X/X+ comprises the minimally
# required key setup and single-block procedures.
#
# April 2016
#
# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
# that the parallelizable nature of CBC decrypt and CTR is not
# utilized yet. CBC encrypt, on the other hand, is as good as it can
# possibly get, processing one byte in 4.1 cycles with a 128-bit key
# on SPARC64 X. This is ~6x faster than the pure software
# implementation...
#
# July 2016
#
# Switch from faligndata to fshiftorx, which makes it possible to omit
# the alignaddr instructions and improves single-block and short-input
# performance with misaligned data.
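#
# fshiftorx takes two data doublewords plus a third operand holding
# pre-computed shift parameters (the .Linp_align/.Lout_align byte
# tables at the end of this module), which is what lets a misaligned
# 16-byte block be assembled from aligned ldd loads without any
# alignaddr setup.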

$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";

{
my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));

$code.=<<___;
#include "sparc_arch.h"

#define LOCALS (STACK_BIAS+STACK_FRAME)

.text

.globl	aes_fx_encrypt
.align	32
aes_fx_encrypt:
	and		$inp, 7, $tmp		! is input aligned?
	andn		$inp, 7, $inp
	ldd		[$key +  0], %f6	! round[0]
	ldd		[$key +  8], %f8
	mov		%o7, %g1
	ld		[$key + 240], $rounds

1:	call		.+8
	add		%o7, .Linp_align-1b, %o7
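	! %o7 now points at the .Linp_align table at the end of this
	! module, computed PC-relative so the code stays position-independent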

	sll		$tmp, 3, $tmp
	ldd		[$inp + 0], %f0		! load input
	brz,pt		$tmp, .Lenc_inp_aligned
	ldd		[$inp + 8], %f2

	ldd		[%o7 + $tmp], %f14	! shift left params
	ldd		[$inp + 16], %f4
	fshiftorx	%f0, %f2, %f14, %f0
	fshiftorx	%f2, %f4, %f14, %f2

.Lenc_inp_aligned:
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fxor		%f0, %f6, %f0		! ^=round[0]
	fxor		%f2, %f8, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $key
	sub		$rounds, 4, $rounds

.Loop_enc:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10
	ldd		[$key + 24], %f12
	add		$key, 32, $key

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$key +  0], %f6
	ldd		[$key +  8], %f8

	brnz,a		$rounds, .Loop_enc
	sub		$rounds, 2, $rounds

	andcc		$out, 7, $tmp		! is output aligned?
	andn		$out, 7, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask
	add		%o7, 64, %o7
	sll		$tmp, 3, $tmp

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[%o7 + $tmp], %f14	! shift right params

	fmovd		%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

	bnz,pn		%icc, .Lenc_out_unaligned
	mov		%g1, %o7

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

.align	16
.Lenc_out_unaligned:
	add		$out, 16, $inp
	orn		%g0, $mask, $tmp
	fshiftorx	%f0, %f0, %f14, %f4
	fshiftorx	%f0, %f2, %f14, %f6
	fshiftorx	%f2, %f2, %f14, %f8

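	! the stores below use the partial-store ASI (0xc0); the register
	! term is a byte-select mask rather than an address offset, so only
	! the bytes belonging to the misaligned 16-byte result are written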
	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	stda		%f8, [$inp + $tmp]0xc0	! partial store
	retl
	nop
.type	aes_fx_encrypt,#function
.size	aes_fx_encrypt,.-aes_fx_encrypt

.globl	aes_fx_decrypt
.align	32
aes_fx_decrypt:
	and		$inp, 7, $tmp		! is input aligned?
	andn		$inp, 7, $inp
	ldd		[$key +  0], %f6	! round[0]
	ldd		[$key +  8], %f8
	mov		%o7, %g1
	ld		[$key + 240], $rounds

1:	call		.+8
	add		%o7, .Linp_align-1b, %o7

	sll		$tmp, 3, $tmp
	ldd		[$inp + 0], %f0		! load input
	brz,pt		$tmp, .Ldec_inp_aligned
	ldd		[$inp + 8], %f2

	ldd		[%o7 + $tmp], %f14	! shift left params
	ldd		[$inp + 16], %f4
	fshiftorx	%f0, %f2, %f14, %f0
	fshiftorx	%f2, %f4, %f14, %f2

.Ldec_inp_aligned:
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fxor		%f0, %f6, %f0		! ^=round[0]
	fxor		%f2, %f8, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $key
	sub		$rounds, 4, $rounds

.Loop_dec:
	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$key + 16], %f10
	ldd		[$key + 24], %f12
	add		$key, 32, $key

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd		[$key +  0], %f6
	ldd		[$key +  8], %f8

	brnz,a		$rounds, .Loop_dec
	sub		$rounds, 2, $rounds

	andcc		$out, 7, $tmp		! is output aligned?
	andn		$out, 7, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask
	add		%o7, 64, %o7
	sll		$tmp, 3, $tmp

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[%o7 + $tmp], %f14	! shift right params

	fmovd		%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

	bnz,pn		%icc, .Ldec_out_unaligned
	mov		%g1, %o7

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

.align	16
.Ldec_out_unaligned:
	add		$out, 16, $inp
	orn		%g0, $mask, $tmp
	fshiftorx	%f0, %f0, %f14, %f4
	fshiftorx	%f0, %f2, %f14, %f6
	fshiftorx	%f2, %f2, %f14, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	stda		%f8, [$inp + $tmp]0xc0	! partial store
	retl
	nop
.type	aes_fx_decrypt,#function
.size	aes_fx_decrypt,.-aes_fx_decrypt
___
}
{
my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
$code.=<<___;
.globl	aes_fx_set_decrypt_key
.align	32
aes_fx_set_decrypt_key:
	b		.Lset_encrypt_key
	mov		-1, $inc
	retl
	nop
.type	aes_fx_set_decrypt_key,#function
.size	aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key

.globl	aes_fx_set_encrypt_key
.align	32
aes_fx_set_encrypt_key:
	mov		1, $inc
	nop
.Lset_encrypt_key:
	and		$inp, 7, $tmp
	andn		$inp, 7, $inp
	sll		$tmp, 3, $tmp
	mov		%o7, %g1

1:	call		.+8
	add		%o7, .Linp_align-1b, %o7

	ldd		[%o7 + $tmp], %f10	! shift left params
	mov		%g1, %o7

	cmp		$bits, 192
	ldd		[$inp + 0], %f0
	bl,pt		%icc, .L128
	ldd		[$inp + 8], %f2

	be,pt		%icc, .L192
	ldd		[$inp + 16], %f4
	brz,pt		$tmp, .L256aligned
	ldd		[$inp + 24], %f6

	ldd		[$inp + 32], %f8
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2
	fshiftorx	%f4, %f6, %f10, %f4
	fshiftorx	%f6, %f8, %f10, %f6

.L256aligned:
	mov		14, $bits
	and		$inc, `14*16`, $tmp
	st		$bits, [$out + 240]	! store rounds
	add		$out, $tmp, $out	! start or end of key schedule
	sllx		$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<6; $i++) {
    $code.=<<___;
	std		%f0, [$out + 0]
	faeskeyx	%f6, `0x10+$i`, %f0
	std		%f2, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
	std		%f4, [$out + 0]
	faeskeyx	%f2, 0x01, %f4
	std		%f6, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f4, 0x00, %f6
___
}
$code.=<<___;
	std		%f0, [$out + 0]
	faeskeyx	%f6, `0x10+$i`, %f0
	std		%f2, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
	std		%f4,[$out + 0]
	std		%f6,[$out + 8]
	add		$out, $inc, $out
	std		%f0,[$out + 0]
	std		%f2,[$out + 8]
	retl
	xor		%o0, %o0, %o0		! return 0

.align	16
.L192:
	brz,pt		$tmp, .L192aligned
	nop

	ldd		[$inp + 24], %f6
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2
	fshiftorx	%f4, %f6, %f10, %f4

.L192aligned:
	mov		12, $bits
	and		$inc, `12*16`, $tmp
	st		$bits, [$out + 240]	! store rounds
	add		$out, $tmp, $out	! start or end of key schedule
	sllx		$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<8; $i+=2) {
    $code.=<<___;
	std		%f0, [$out + 0]
	faeskeyx	%f4, `0x10+$i`, %f0
	std		%f2, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
	std		%f4, [$out + 0]
	faeskeyx	%f2, 0x00, %f4
	std		%f0, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f4, `0x10+$i+1`, %f0
	std		%f2, [$out + 0]
	faeskeyx	%f0, 0x00, %f2
	std		%f4, [$out + 8]
	add		$out, $inc, $out
___
$code.=<<___		if ($i<6);
	faeskeyx	%f2, 0x00, %f4
___
}
$code.=<<___;
	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	retl
	xor		%o0, %o0, %o0		! return 0

.align	16
.L128:
	brz,pt		$tmp, .L128aligned
	nop

	ldd		[$inp + 16], %f4
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2

.L128aligned:
	mov		10, $bits
	and		$inc, `10*16`, $tmp
	st		$bits, [$out + 240]	! store rounds
	add		$out, $tmp, $out	! start or end of key schedule
	sllx		$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<10; $i++) {
    $code.=<<___;
	std		%f0, [$out + 0]
	faeskeyx	%f2, `0x10+$i`, %f0
	std		%f2, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
___
}
$code.=<<___;
	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	retl
	xor		%o0, %o0, %o0		! return 0
.type	aes_fx_set_encrypt_key,#function
.size	aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
___
}
{
my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
   = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign,$oalign);

$code.=<<___;
.globl	aes_fx_cbc_encrypt
.align	32
aes_fx_cbc_encrypt:
	save		%sp, -STACK_FRAME-16, %sp
	srln		$len, 4, $len
	and		$inp, 7, $ialign
	andn		$inp, 7, $inp
	brz,pn		$len, .Lcbc_no_data
	sll		$ialign, 3, $ileft

1:	call		.+8
	add		%o7, .Linp_align-1b, %o7

	ld		[$key + 240], $rounds
	and		$out, 7, $oalign
	ld		[$ivp + 0], %f0		! load ivec
	andn		$out, 7, $out
	ld		[$ivp + 4], %f1
	sll		$oalign, 3, $mask
	ld		[$ivp + 8], %f2
	ld		[$ivp + 12], %f3

	sll		$rounds, 4, $rounds
	add		$rounds, $key, $end
	ldd		[$key + 0], $r0hi	! round[0]
	ldd		[$key + 8], $r0lo

	add		$inp, 16, $inp
	sub		$len,  1, $len
	ldd		[$end + 0], $rlhi	! round[last]
	ldd		[$end + 8], $rllo

	mov		16, $inc
	movrz		$len, 0, $inc
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	ldd		[%o7 + $ileft], $fshift	! shift left params
	add		%o7, 64, %o7
	ldd		[$inp - 16], $in0	! load input
	ldd		[$inp -  8], $in1
	ldda		[$inp]0x82, $intail	! non-faulting load
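	! ASI 0x82 is the primary no-fault ASI: the speculative read-ahead
	! of the next block cannot trap even at the end of the caller's buffer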
	brz		$dir, .Lcbc_decrypt
	add		$inp, $inc, $inp	! inp+=16

	fxor		$r0hi, %f0, %f0		! ivec^=round[0]
	fxor		$r0lo, %f2, %f2
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
	nop

.Loop_cbc_enc:
	fxor		$in0, %f0, %f0		! inp^ivec^round[0]
	fxor		$in1, %f2, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $end
	sub		$rounds, 16*6, $inner

.Lcbc_enc:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lcbc_enc
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	movrz		$len, 0, $inc
	fmovd		$intail, $in0
	ldd		[$inp - 8], $in1	! load next input block
	ldda		[$inp]0x82, $intail	! non-faulting load
	add		$inp, $inc, $inp	! inp+=16

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fxor		$r0hi, $in0, $in0	! inp^=round[0]
	fxor		$r0lo, $in1, $in1

	fmovd		%f0, %f4
	faesenclx	%f2, $rlhi, %f0
	faesenclx	%f4, $rllo, %f2

	brnz,pn		$oalign, .Lcbc_enc_unaligned_out
	nop

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_cbc_enc
	sub		$len, 1, $len

	st		%f0, [$ivp + 0]		! output ivec
	st		%f1, [$ivp + 4]
	st		%f2, [$ivp + 8]
	st		%f3, [$ivp + 12]

.Lcbc_no_data:
	ret
	restore

.align	32
.Lcbc_enc_unaligned_out:
	ldd		[%o7 + $mask], $fshift	! shift right params
	mov		0xff, $mask
	srl		$mask, $oalign, $mask
	sub		%g0, $ileft, $iright

	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8

	stda		%f6, [$out + $mask]0xc0	! partial store
	orn		%g0, $mask, $mask
	std		%f8, [$out + 8]
	add		$out, 16, $out
	brz		$len, .Lcbc_enc_unaligned_out_done
	sub		$len, 1, $len
	b		.Loop_cbc_enc_unaligned_out
	nop

.align	32
.Loop_cbc_enc_unaligned_out:
	fmovd		%f2, $outhead
	fxor		$in0, %f0, %f0		! inp^ivec^round[0]
	fxor		$in1, %f2, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 48], %f10	! round[3]
	ldd		[$key + 56], %f12

	ldx		[$inp - 16], %o0
	ldx		[$inp -  8], %o1
	brz		$ileft, .Lcbc_enc_aligned_inp
	movrz		$len, 0, $inc

	ldx		[$inp], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1

.Lcbc_enc_aligned_inp:
	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$key + 64], %f6	! round[4]
	ldd		[$key + 72], %f8
	add		$key, 64, $end
	sub		$rounds, 16*8, $inner

	stx		%o0, [%sp + LOCALS + 0]
	stx		%o1, [%sp + LOCALS + 8]
	add		$inp, $inc, $inp	! inp+=16
	nop

.Lcbc_enc_unaligned:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lcbc_enc_unaligned
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2

	ldd		[%sp + LOCALS + 0], $in0
	ldd		[%sp + LOCALS + 8], $in1

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fxor		$r0hi, $in0, $in0	! inp^=round[0]
	fxor		$r0lo, $in1, $in1

	fmovd		%f0, %f4
	faesenclx	%f2, $rlhi, %f0
	faesenclx	%f4, $rllo, %f2

	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
	std		%f6, [$out + 0]
	std		%f8, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_cbc_enc_unaligned_out
	sub		$len, 1, $len

.Lcbc_enc_unaligned_out_done:
	fshiftorx	%f2, %f2, $fshift, %f8
	stda		%f8, [$out + $mask]0xc0	! partial store

	st		%f0, [$ivp + 0]		! output ivec
	st		%f1, [$ivp + 4]
	st		%f2, [$ivp + 8]
	st		%f3, [$ivp + 12]

	ret
	restore

.align	32
.Lcbc_decrypt:
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
	fmovd		%f0, $iv0
	fmovd		%f2, $iv1

.Loop_cbc_dec:
	fxor		$in0, $r0hi, %f0	! inp^round[0]
	fxor		$in1, $r0lo, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $end
	sub		$rounds, 16*6, $inner

.Lcbc_dec:
	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lcbc_dec
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	fxor		$iv0, $rlhi, %f6	! ivec^round[last]
	fxor		$iv1, $rllo, %f8
	fmovd		$in0, $iv0
	fmovd		$in1, $iv1

	movrz		$len, 0, $inc
	fmovd		$intail, $in0
	ldd		[$inp - 8], $in1	! load next input block
	ldda		[$inp]0x82, $intail	! non-faulting load
	add		$inp, $inc, $inp	! inp+=16

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1

	fmovd		%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

	brnz,pn		$oalign, .Lcbc_dec_unaligned_out
	nop

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_cbc_dec
	sub		$len, 1, $len

	st		$iv0,    [$ivp + 0]	! output ivec
	st		$iv0#lo, [$ivp + 4]
	st		$iv1,    [$ivp + 8]
	st		$iv1#lo, [$ivp + 12]

	ret
	restore

.align	32
.Lcbc_dec_unaligned_out:
	ldd		[%o7 + $mask], $fshift	! shift right params
	mov		0xff, $mask
	srl		$mask, $oalign, $mask
	sub		%g0, $ileft, $iright

	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8

	stda		%f6, [$out + $mask]0xc0	! partial store
	orn		%g0, $mask, $mask
	std		%f8, [$out + 8]
	add		$out, 16, $out
	brz		$len, .Lcbc_dec_unaligned_out_done
	sub		$len, 1, $len
	b		.Loop_cbc_dec_unaligned_out
	nop

.align	32
.Loop_cbc_dec_unaligned_out:
	fmovd		%f2, $outhead
	fxor		$in0, $r0hi, %f0	! inp^round[0]
	fxor		$in1, $r0lo, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$key + 48], %f10	! round[3]
	ldd		[$key + 56], %f12

	ldx		[$inp - 16], %o0
	ldx		[$inp - 8], %o1
	brz		$ileft, .Lcbc_dec_aligned_inp
	movrz		$len, 0, $inc

	ldx		[$inp], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1

.Lcbc_dec_aligned_inp:
	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd		[$key + 64], %f6	! round[4]
	ldd		[$key + 72], %f8
	add		$key, 64, $end
	sub		$rounds, 16*8, $inner

	stx		%o0, [%sp + LOCALS + 0]
	stx		%o1, [%sp + LOCALS + 8]
	add		$inp, $inc, $inp	! inp+=16
	nop

.Lcbc_dec_unaligned:
	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lcbc_dec_unaligned
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2

	fxor		$iv0, $rlhi, %f6	! ivec^round[last]
	fxor		$iv1, $rllo, %f8
	fmovd		$in0, $iv0
	fmovd		$in1, $iv1
	ldd		[%sp + LOCALS + 0], $in0
	ldd		[%sp + LOCALS + 8], $in1

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fmovd		%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
	std		%f6, [$out + 0]
	std		%f8, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_cbc_dec_unaligned_out
	sub		$len, 1, $len

.Lcbc_dec_unaligned_out_done:
	fshiftorx	%f2, %f2, $fshift, %f8
	stda		%f8, [$out + $mask]0xc0	! partial store

	st		$iv0,    [$ivp + 0]	! output ivec
	st		$iv0#lo, [$ivp + 4]
	st		$iv1,    [$ivp + 8]
	st		$iv1#lo, [$ivp + 12]

	ret
	restore
.type	aes_fx_cbc_encrypt,#function
.size	aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
___
}
{
my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
   = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign, $oalign);
my $one = "%f14";

$code.=<<___;
.globl	aes_fx_ctr32_encrypt_blocks
.align	32
aes_fx_ctr32_encrypt_blocks:
	save		%sp, -STACK_FRAME-16, %sp
	srln		$len, 0, $len
	and		$inp, 7, $ialign
	andn		$inp, 7, $inp
	brz,pn		$len, .Lctr32_no_data
	sll		$ialign, 3, $ileft

.Lpic:	call		.+8
	add		%o7, .Linp_align - .Lpic, %o7

	ld		[$key + 240], $rounds
	and		$out, 7, $oalign
	ld		[$ivp +  0], $ctr0	! load counter
	andn		$out, 7, $out
	ld		[$ivp +  4], $ctr0#lo
	sll		$oalign, 3, $mask
	ld		[$ivp +  8], $ctr1
	ld		[$ivp + 12], $ctr1#lo
	ldd		[%o7 + 128], $one
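	! %o7 still points at .Linp_align; +128 skips the two 64-byte
	! shift-parameter tables and lands on .Lone, the counter increment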

	sll		$rounds, 4, $rounds
	add		$rounds, $key, $end
	ldd		[$key + 0], $r0hi	! round[0]
	ldd		[$key + 8], $r0lo

	add		$inp, 16, $inp
	sub		$len, 1, $len
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	mov		16, $inc
	movrz		$len, 0, $inc
	ldd		[$end + 0], $rlhi	! round[last]
	ldd		[$end + 8], $rllo

	ldd		[%o7 + $ileft], $fshift	! shift left params
	add		%o7, 64, %o7
	ldd		[$inp - 16], $in0	! load input
	ldd		[$inp -  8], $in1
	ldda		[$inp]0x82, $intail	! non-faulting load
	add		$inp, $inc, $inp	! inp+=16

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1

.Loop_ctr32:
	fxor		$ctr0, $r0hi, %f0	! counter^round[0]
	fxor		$ctr1, $r0lo, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $end
	sub		$rounds, 16*6, $inner

.Lctr32_enc:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lctr32_enc
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	fxor		$in0, $rlhi, %f6	! inp^round[last]
	fxor		$in1, $rllo, %f8

	movrz		$len, 0, $inc
	fmovd		$intail, $in0
	ldd		[$inp - 8], $in1	! load next input block
	ldda		[$inp]0x82, $intail	! non-faulting load
	add		$inp, $inc, $inp	! inp+=16

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
	fpadd32		$ctr1, $one, $ctr1	! increment counter

	fmovd		%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

	brnz,pn		$oalign, .Lctr32_unaligned_out
	nop

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_ctr32
	sub		$len, 1, $len

.Lctr32_no_data:
	ret
	restore

.align	32
.Lctr32_unaligned_out:
	ldd		[%o7 + $mask], $fshift	! shift right params
	mov		0xff, $mask
	srl		$mask, $oalign, $mask
	sub		%g0, $ileft, $iright

	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8

	stda		%f6, [$out + $mask]0xc0	! partial store
	orn		%g0, $mask, $mask
	std		%f8, [$out + 8]
	add		$out, 16, $out
	brz		$len, .Lctr32_unaligned_out_done
	sub		$len, 1, $len
	b		.Loop_ctr32_unaligned_out
	nop

.align	32
.Loop_ctr32_unaligned_out:
	fmovd		%f2, $outhead
	fxor		$ctr0, $r0hi, %f0	! counter^round[0]
	fxor		$ctr1, $r0lo, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 48], %f10	! round[3]
	ldd		[$key + 56], %f12

	ldx		[$inp - 16], %o0
	ldx		[$inp -  8], %o1
	brz		$ileft, .Lctr32_aligned_inp
	movrz		$len, 0, $inc

	ldx		[$inp], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1

.Lctr32_aligned_inp:
	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$key + 64], %f6	! round[4]
	ldd		[$key + 72], %f8
	add		$key, 64, $end
	sub		$rounds, 16*8, $inner

	stx		%o0, [%sp + LOCALS + 0]
	stx		%o1, [%sp + LOCALS + 8]
	add		$inp, $inc, $inp	! inp+=16
	nop

.Lctr32_enc_unaligned:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lctr32_enc_unaligned
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12
	fpadd32		$ctr1, $one, $ctr1	! increment counter

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	fxor		$in0, $rlhi, %f6	! inp^round[last]
	fxor		$in1, $rllo, %f8
	ldd		[%sp + LOCALS + 0], $in0
	ldd		[%sp + LOCALS + 8], $in1

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fmovd		%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
	std		%f6, [$out + 0]
	std		%f8, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_ctr32_unaligned_out
	sub		$len, 1, $len

.Lctr32_unaligned_out_done:
	fshiftorx	%f2, %f2, $fshift, %f8
	stda		%f8, [$out + $mask]0xc0	! partial store

	ret
	restore
.type	aes_fx_ctr32_encrypt_blocks,#function
.size	aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks

.align	32
.Linp_align:		! fshiftorx parameters for left shift toward %rs1
	.byte	0, 0, 64,  0,	0, 64,  0, -64
	.byte	0, 0, 56,  8,	0, 56,  8, -56
	.byte	0, 0, 48, 16,	0, 48, 16, -48
	.byte	0, 0, 40, 24,	0, 40, 24, -40
	.byte	0, 0, 32, 32,	0, 32, 32, -32
	.byte	0, 0, 24, 40,	0, 24, 40, -24
	.byte	0, 0, 16, 48,	0, 16, 48, -16
	.byte	0, 0,  8, 56,	0,  8, 56, -8
.Lout_align:		! fshiftorx parameters for right shift toward %rs2
	.byte	0, 0,  0, 64,	0,  0, 64,   0
	.byte	0, 0,  8, 56,	0,  8, 56,  -8
	.byte	0, 0, 16, 48,	0, 16, 48, -16
	.byte	0, 0, 24, 40,	0, 24, 40, -24
	.byte	0, 0, 32, 32,	0, 32, 32, -32
	.byte	0, 0, 40, 24,	0, 40, 24, -40
	.byte	0, 0, 48, 16,	0, 48, 16, -48
	.byte	0, 0, 56,  8,	0, 56,  8, -56
.Lone:
	.word	0, 1
.asciz	"AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
}
# The purpose of these subroutines is to encode VIS instructions
# explicitly, so that the module can be compiled without specifying VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to retain the option of producing a "universal" binary and
# let the programmer detect at run-time whether the current CPU is VIS
# capable.
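#
# For instance, with the opcode tables below, "faesencx %f2,%f10,%f0"
# (rd=0, rs1=2, opf=0x90, rs2=10) is emitted by unfx() as
# ".word 0x81b0920a"; the upper-bank re-encoding only applies to double
# registers %f32 and above.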
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fpadd32"	=> 0x052,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unfx {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"faesencx"	=> 0x90,
		"faesdecx"	=> 0x91,
		"faesenclx"	=> 0x92,
		"faesdeclx"	=> 0x93,
		"faeskeyx"	=> 0x94	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
	$rs2 = oct($rs2) if ($rs2 =~ /^0/);

	foreach ($rs1,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unfx3src {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"fshiftorx"	=> 0x0b	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rs3,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unfx($1,$2,$3,$4)
     /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unfx3src($1,$2,$3,$4,$5)
     /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
     /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
     /ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";