#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD
# license. October 2012. All rights reserved.
# ====================================================================

######################################################################
# Camellia for SPARC T4.
#
# As with AES, the results below [for aligned data] are virtually
# identical to the critical path lengths for a 3-cycle instruction
# latency:
#
#		128-bit key	192/256-bit key
# CBC encrypt	4.14/4.21(*)	5.46/5.52
#			 (*) numbers after slash are for
#			     misaligned data.
#
# As with Intel AES-NI, the question is whether the performance of
# parallelizable modes can be improved by interleaving round
# instructions. In Camellia every instruction depends on the previous
# one, which means that there is room for 2 additional instructions
# between two dependent ones. Can we expect a 3x performance
# improvement? At least one can argue that it should be possible to
# break the 2x barrier... For some reason not even 2x appears to be
# possible:
#
#		128-bit key	192/256-bit key
# CBC decrypt	2.21/2.74	2.99/3.40
# CTR		2.15/2.68(*)	2.93/3.34
#			 (*) numbers after slash are for
#			     misaligned data.
#
# This is for 2x interleave. But compared to 1x interleave, CBC decrypt
# improved by ... 0% for a 128-bit key, and 11% for a 192/256-bit one.
# So the out-of-order execution logic can take non-interleaved code
# to 1.87x, but can't take the 2x interleaved one any further. There
# surely is some explanation... As a result, 3x interleave was not even
# attempted. Instead an effort was made to share the mode-specific
# implementations with the AES module (hence sparcv9_modes.pl).
#
# For reference, the software C implementation processes one byte in
# 38 cycles with a 128-bit key on the same processor.

# Derive the directory this script lives in from $0 so that the shared
# perlasm helper can be found regardless of the current directory.
# When $0 carries no directory component, $dir is the empty string.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

# The last command-line argument names the output file.  Without one the
# generated code goes to the inherited STDOUT; a file that cannot be
# opened is a fatal error.
$output = pop;
if (defined($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

$::evp=1;	# if $evp is set to 0, script generates module with
# Camellia_[en|de]crypt, Camellia_set_key and Camellia_cbc_encrypt
# entry points. These are fully compatible with openssl/camellia.h.

######################################################################
# single-round subroutines
#
# Starts the assembly text in $code with cmll_t4_encrypt and
# cmll_t4_decrypt.  Each routine handles one 16-byte block:
#
#   %o0  input pointer, %o1  output pointer, %o2  key schedule pointer
#   %o3-%o5 are used as scratch (loop counter, alignment remainder,
#   partial-store mask).
#
# A misaligned input is read as three 64-bit words that are shifted and
# merged; a misaligned output is written with faligndata plus partial
# stores.  The 32-bit word at key+272 ("grandRounds", 3 or 4) drives the
# loop count.  Decryption starts at key + 64*grandRounds and walks the
# schedule backwards.
#
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));

$code=<<___;
#include "sparc_arch.h"

.text

.globl	cmll_t4_encrypt
.align	32
cmll_t4_encrypt:
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[$key + 272], $rounds	! grandRounds, 3 or 4
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	ldd		[$key + 32], %f16
	ldd		[$key + 40], %f18
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	ldd		[$key + 48], %f20
	ldd		[$key + 56], %f22
	sub		$rounds, 1, $rounds
	ldd		[$key + 64], %f24
	ldd		[$key + 72], %f26
	add		$key, 80, $key

.Lenc:
	camellia_f	%f12, %f2, %f0, %f2
	ldd		[$key + 0], %f12
	sub		$rounds,1,$rounds
	camellia_f	%f14, %f0, %f2, %f0
	ldd		[$key + 8], %f14
	camellia_f	%f16, %f2, %f0, %f2
	ldd		[$key + 16], %f16
	camellia_f	%f18, %f0, %f2, %f0
	ldd		[$key + 24], %f18
	camellia_f	%f20, %f2, %f0, %f2
	ldd		[$key + 32], %f20
	camellia_f	%f22, %f0, %f2, %f0
	ldd		[$key + 40], %f22
	camellia_fl	%f24, %f0, %f0
	ldd		[$key + 48], %f24
	camellia_fli	%f26, %f2, %f2
	ldd		[$key + 56], %f26
	brnz,pt		$rounds, .Lenc
	add		$key, 64, $key

	andcc		$out, 7, $tmp		! is output aligned?
	camellia_f	%f12, %f2, %f0, %f2
	camellia_f	%f14, %f0, %f2, %f0
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f18, %f0, %f2, %f0
	camellia_f	%f20, %f2, %f0, %f4
	camellia_f	%f22, %f0, %f4, %f2
	fxor		%f24, %f4, %f0
	fxor		%f26, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	cmll_t4_encrypt,#function
.size	cmll_t4_encrypt,.-cmll_t4_encrypt

.globl	cmll_t4_decrypt
.align	32
cmll_t4_decrypt:
	ld		[$key + 272], $rounds	! grandRounds, 3 or 4
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	sll		$rounds, 6, $rounds
	add		$rounds, $key, $key

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %g4
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %g4, %o5
	srlx		$inp, %g4, %g4
	or		%o5, %o4, %o4
	or		%g4, %g1, %o5
1:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
	ldd		[$key - 8], %f12
	ldd		[$key - 16], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	ldd		[$key - 24], %f16
	ldd		[$key - 32], %f18
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	ldd		[$key - 40], %f20
	ldd		[$key - 48], %f22
	sub		$rounds, 64, $rounds
	ldd		[$key - 56], %f24
	ldd		[$key - 64], %f26
	sub		$key, 64, $key

.Ldec:
	camellia_f	%f12, %f2, %f0, %f2
	ldd		[$key - 8], %f12
	sub		$rounds, 64, $rounds
	camellia_f	%f14, %f0, %f2, %f0
	ldd		[$key - 16], %f14
	camellia_f	%f16, %f2, %f0, %f2
	ldd		[$key - 24], %f16
	camellia_f	%f18, %f0, %f2, %f0
	ldd		[$key - 32], %f18
	camellia_f	%f20, %f2, %f0, %f2
	ldd		[$key - 40], %f20
	camellia_f	%f22, %f0, %f2, %f0
	ldd		[$key - 48], %f22
	camellia_fl	%f24, %f0, %f0
	ldd		[$key - 56], %f24
	camellia_fli	%f26, %f2, %f2
	ldd		[$key - 64], %f26
	brnz,pt		$rounds, .Ldec
	sub		$key, 64, $key

	andcc		$out, 7, $tmp		! is output aligned?
	camellia_f	%f12, %f2, %f0, %f2
	camellia_f	%f14, %f0, %f2, %f0
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f18, %f0, %f2, %f0
	camellia_f	%f20, %f2, %f0, %f4
	camellia_f	%f22, %f0, %f4, %f2
	fxor		%f26, %f4, %f0
	fxor		%f24, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	cmll_t4_decrypt,#function
.size	cmll_t4_decrypt,.-cmll_t4_decrypt
___
}

######################################################################
# key setup subroutines
#
# Appends cmll_t4_set_key and the SIGMA constant table to $code.
# cmll_t4_set_key takes the user key in %o0, the key length in bits in
# %o1 and the output key schedule in %o2.  It stores the loop count
# (3 for the 128-bit path, 4 otherwise) at offset 0x110 of the schedule
# and returns 0 in %o0.
#
{
# ROTL128(rot) returns the assembly text that rotates the 128-bit value
# held in %o4 (upper half) and %o5 (lower half) left by rot bits,
# using %g4 and %g5 as scratch.  The shift counts are emitted as
# "64-rot" and "rot", so rot has to be in 1..63.  Successive calls
# operate on the current register contents, i.e. rotations accumulate.
sub ROTL128 {
  my $rot = shift;

  return join("\n\t",
	"srlx	%o4, 64-$rot, %g4",
	"sllx	%o4, $rot, %o4",
	"srlx	%o5, 64-$rot, %g5",
	"sllx	%o5, $rot, %o5",
	"or	%o4, %g5, %o4",
	"or	%o5, %g4, %o5");
}

my ($inp,$bits,$out,$tmp)=map("%o$_",(0..3));
$code.=<<___;
.globl	cmll_t4_set_key
.align	32
cmll_t4_set_key:
	and		$inp, 7, $tmp
	alignaddr	$inp, %g0, $inp
	cmp		$bits, 192
	ldd		[$inp + 0], %f0
	bl,pt		%icc,.L128
	ldd		[$inp + 8], %f2

	be,pt		%icc,.L192
	ldd		[$inp + 16], %f4

	brz,pt		$tmp, .L256aligned
	ldd		[$inp + 24], %f6

	ldd		[$inp + 32], %f8
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	b		.L256aligned
	faligndata	%f6, %f8, %f6

.align	16
.L192:
	brz,a,pt	$tmp, .L256aligned
	fnot2		%f4, %f6

	ldd		[$inp + 24], %f6
	nop
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	fnot2		%f4, %f6

.L256aligned:
	std		%f0, [$out + 0]		! k[0, 1]
	fsrc2		%f0, %f28
	std		%f2, [$out + 8]		! k[2, 3]
	fsrc2		%f2, %f30
	fxor		%f4, %f0, %f0
	b		.L128key
	fxor		%f6, %f2, %f2

.align	16
.L128:
	brz,pt		$tmp, .L128aligned
	nop

	ldd		[$inp + 16], %f4
	nop
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2

.L128aligned:
	std		%f0, [$out + 0]		! k[0, 1]
	fsrc2		%f0, %f28
	std		%f2, [$out + 8]		! k[2, 3]
	fsrc2		%f2, %f30

.L128key:
	mov		%o7, %o5
1:	call		.+8
	add		%o7, SIGMA-1b, %o4
	mov		%o5, %o7

	ldd		[%o4 + 0], %f16
	ldd		[%o4 + 8], %f18
	ldd		[%o4 + 16], %f20
	ldd		[%o4 + 24], %f22

	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f18, %f0, %f2, %f0
	fxor		%f28, %f0, %f0
	fxor		%f30, %f2, %f2
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f22, %f0, %f2, %f0

	bge,pn		%icc, .L256key
	nop
	std	%f0, [$out + 0x10]	! k[ 4,  5]
	std	%f2, [$out + 0x18]	! k[ 6,  7]

	movdtox	%f0, %o4
	movdtox	%f2, %o5
	`&ROTL128(15)`
	stx	%o4, [$out + 0x30]	! k[12, 13]
	stx	%o5, [$out + 0x38]	! k[14, 15]
	`&ROTL128(15)`
	stx	%o4, [$out + 0x40]	! k[16, 17]
	stx	%o5, [$out + 0x48]	! k[18, 19]
	`&ROTL128(15)`
	stx	%o4, [$out + 0x60]	! k[24, 25]
	`&ROTL128(15)`
	stx	%o4, [$out + 0x70]	! k[28, 29]
	stx	%o5, [$out + 0x78]	! k[30, 31]
	`&ROTL128(34)`
	stx	%o4, [$out + 0xa0]	! k[40, 41]
	stx	%o5, [$out + 0xa8]	! k[42, 43]
	`&ROTL128(17)`
	stx	%o4, [$out + 0xc0]	! k[48, 49]
	stx	%o5, [$out + 0xc8]	! k[50, 51]

	movdtox	%f28, %o4		! k[ 0,  1]
	movdtox	%f30, %o5		! k[ 2,  3]
	`&ROTL128(15)`
	stx	%o4, [$out + 0x20]	! k[ 8,  9]
	stx	%o5, [$out + 0x28]	! k[10, 11]
	`&ROTL128(30)`
	stx	%o4, [$out + 0x50]	! k[20, 21]
	stx	%o5, [$out + 0x58]	! k[22, 23]
	`&ROTL128(15)`
	stx	%o5, [$out + 0x68]	! k[26, 27]
	`&ROTL128(17)`
	stx	%o4, [$out + 0x80]	! k[32, 33]
	stx	%o5, [$out + 0x88]	! k[34, 35]
	`&ROTL128(17)`
	stx	%o4, [$out + 0x90]	! k[36, 37]
	stx	%o5, [$out + 0x98]	! k[38, 39]
	`&ROTL128(17)`
	stx	%o4, [$out + 0xb0]	! k[44, 45]
	stx	%o5, [$out + 0xb8]	! k[46, 47]

	mov		3, $tmp
	st		$tmp, [$out + 0x110]
	retl
	xor		%o0, %o0, %o0

.align	16
.L256key:
	ldd		[%o4 + 32], %f24
	ldd		[%o4 + 40], %f26

	std		%f0, [$out + 0x30]	! k[12, 13]
	std		%f2, [$out + 0x38]	! k[14, 15]

	fxor		%f4, %f0, %f0
	fxor		%f6, %f2, %f2
	camellia_f	%f24, %f2, %f0, %f2
	camellia_f	%f26, %f0, %f2, %f0

	std	%f0, [$out + 0x10]	! k[ 4,  5]
	std	%f2, [$out + 0x18]	! k[ 6,  7]

	movdtox	%f0, %o4
	movdtox	%f2, %o5
	`&ROTL128(30)`
	stx	%o4, [$out + 0x50]	! k[20, 21]
	stx	%o5, [$out + 0x58]	! k[22, 23]
	`&ROTL128(30)`
	stx	%o4, [$out + 0xa0]	! k[40, 41]
	stx	%o5, [$out + 0xa8]	! k[42, 43]
	`&ROTL128(51)`
	stx	%o4, [$out + 0x100]	! k[64, 65]
	stx	%o5, [$out + 0x108]	! k[66, 67]

	movdtox	%f4, %o4		! k[ 8,  9]
	movdtox	%f6, %o5		! k[10, 11]
	`&ROTL128(15)`
	stx	%o4, [$out + 0x20]	! k[ 8,  9]
	stx	%o5, [$out + 0x28]	! k[10, 11]
	`&ROTL128(15)`
	stx	%o4, [$out + 0x40]	! k[16, 17]
	stx	%o5, [$out + 0x48]	! k[18, 19]
	`&ROTL128(30)`
	stx	%o4, [$out + 0x90]	! k[36, 37]
	stx	%o5, [$out + 0x98]	! k[38, 39]
	`&ROTL128(34)`
	stx	%o4, [$out + 0xd0]	! k[52, 53]
	stx	%o5, [$out + 0xd8]	! k[54, 55]
	ldx	[$out + 0x30], %o4	! k[12, 13]
	ldx	[$out + 0x38], %o5	! k[14, 15]
	`&ROTL128(15)`
	stx	%o4, [$out + 0x30]	! k[12, 13]
	stx	%o5, [$out + 0x38]	! k[14, 15]
	`&ROTL128(30)`
	stx	%o4, [$out + 0x70]	! k[28, 29]
	stx	%o5, [$out + 0x78]	! k[30, 31]
	srlx	%o4, 32, %g4
	srlx	%o5, 32, %g5
	st	%o4, [$out + 0xc0]	! k[48]
	st	%g5, [$out + 0xc4]	! k[49]
	st	%o5, [$out + 0xc8]	! k[50]
	st	%g4, [$out + 0xcc]	! k[51]
	`&ROTL128(49)`
	stx	%o4, [$out + 0xe0]	! k[56, 57]
	stx	%o5, [$out + 0xe8]	! k[58, 59]

	movdtox	%f28, %o4		! k[ 0,  1]
	movdtox	%f30, %o5		! k[ 2,  3]
	`&ROTL128(45)`
	stx	%o4, [$out + 0x60]	! k[24, 25]
	stx	%o5, [$out + 0x68]	! k[26, 27]
	`&ROTL128(15)`
	stx	%o4, [$out + 0x80]	! k[32, 33]
	stx	%o5, [$out + 0x88]	! k[34, 35]
	`&ROTL128(17)`
	stx	%o4, [$out + 0xb0]	! k[44, 45]
	stx	%o5, [$out + 0xb8]	! k[46, 47]
	`&ROTL128(34)`
	stx	%o4, [$out + 0xf0]	! k[60, 61]
	stx	%o5, [$out + 0xf8]	! k[62, 63]

	mov		4, $tmp
	st		$tmp, [$out + 0x110]
	retl
	xor		%o0, %o0, %o0
.type	cmll_t4_set_key,#function
.size	cmll_t4_set_key,.-cmll_t4_set_key
.align	32
SIGMA:
	.long	0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2
	.long	0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c
	.long	0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd
.type	SIGMA,#object
.size	SIGMA,.-SIGMA
.asciz	"Camellia for SPARC T4, David S. Miller, Andy Polyakov"
___
}

{{{
# Helper routines for the CBC/CTR mode code: key-schedule loaders and
# 1x/2x-interleaved block transforms for the 128-bit and the 192/256-bit
# schedules.  They are local (non-.globl) symbols.
# NOTE(review): presumably these names are what the alg_*_implement
# generators called at the end of this block expect to find; confirm
# against sparcv9_modes.pl.
#
# Of the register names below only $key (%i3) is referenced here.
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..5));

$code.=<<___;
.align	32
_cmll128_load_enckey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
for my $i (2..25) {				# load key schedule
    # key + 16 .. key + 200 go to %f16 .. %f62
    $code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
	retl
	nop
.type	_cmll128_load_enckey,#function
.size	_cmll128_load_enckey,.-_cmll128_load_enckey
_cmll256_load_enckey=_cmll128_load_enckey

.align	32
_cmll256_load_deckey:
	ldd		[$key + 64], %f62
	ldd		[$key + 72], %f60
	b		.Load_deckey
	add		$key, 64, $key
_cmll128_load_deckey:
	ldd		[$key + 0], %f60
	ldd		[$key + 8], %f62
.Load_deckey:
___
for my $i (2..23) {				# load key schedule
    # key + 16 .. key + 184 go to %f58 .. %f16, i.e. in reverse order
    $code.=<<___;
	ldd		[$key + `8*$i`], %f`62-2*$i`
___
}
$code.=<<___;
	ldx		[$key + 192], %g4
	retl
	ldx		[$key + 200], %g5
.type	_cmll256_load_deckey,#function
.size	_cmll256_load_deckey,.-_cmll256_load_deckey

.align	32
_cmll128_encrypt_1x:
___
# Three groups of camellia_f rounds; the first two groups are followed
# by a camellia_fl/camellia_fli pair, the last one is finished by the
# code after the loop.
for my $i (0..2) {
    $code.=<<___;
	camellia_f	%f`16+16*$i+0`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+2`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+4`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+6`, %f0, %f2, %f0
___
$code.=<<___ if ($i<2);
	camellia_f	%f`16+16*$i+8`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+10`, %f0, %f2, %f0
	camellia_fl	%f`16+16*$i+12`, %f0,      %f0
	camellia_fli	%f`16+16*$i+14`, %f2,      %f2
___
}
$code.=<<___;
	camellia_f	%f56, %f2, %f0, %f4
	camellia_f	%f58, %f0, %f4, %f2
	fxor		%f60, %f4, %f0
	retl
	fxor		%f62, %f2, %f2
.type	_cmll128_encrypt_1x,#function
.size	_cmll128_encrypt_1x,.-_cmll128_encrypt_1x
_cmll128_decrypt_1x=_cmll128_encrypt_1x

.align	32
_cmll128_encrypt_2x:
___
# Same round structure as the 1x variant, with a second block held in
# %f4/%f6 interleaved instruction by instruction.
for my $i (0..2) {
    $code.=<<___;
	camellia_f	%f`16+16*$i+0`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+0`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+2`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+2`, %f4, %f6, %f4
	camellia_f	%f`16+16*$i+4`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+4`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+6`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+6`, %f4, %f6, %f4
___
$code.=<<___ if ($i<2);
	camellia_f	%f`16+16*$i+8`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+8`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+10`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+10`, %f4, %f6, %f4
	camellia_fl	%f`16+16*$i+12`, %f0,      %f0
	camellia_fl	%f`16+16*$i+12`, %f4,      %f4
	camellia_fli	%f`16+16*$i+14`, %f2,      %f2
	camellia_fli	%f`16+16*$i+14`, %f6,      %f6
___
}
$code.=<<___;
	camellia_f	%f56, %f2, %f0, %f8
	camellia_f	%f56, %f6, %f4, %f10
	camellia_f	%f58, %f0, %f8, %f2
	camellia_f	%f58, %f4, %f10, %f6
	fxor		%f60, %f8, %f0
	fxor		%f60, %f10, %f4
	fxor		%f62, %f2, %f2
	retl
	fxor		%f62, %f6, %f6
.type	_cmll128_encrypt_2x,#function
.size	_cmll128_encrypt_2x,.-_cmll128_encrypt_2x
_cmll128_decrypt_2x=_cmll128_encrypt_2x

.align	32
_cmll256_encrypt_1x:
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f18, %f0, %f2, %f0
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f22, %f0, %f2, %f0
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
	camellia_f	%f24, %f2, %f0, %f2
	camellia_f	%f26, %f0, %f2, %f0
	ldd		[$key + 240], %f24
	ldd		[$key + 248], %f26
	camellia_fl	%f28, %f0, %f0
	camellia_fli	%f30, %f2, %f2
	ldd		[$key + 256], %f28
	ldd		[$key + 264], %f30
___
for my $i (1..2) {
    $code.=<<___;
	camellia_f	%f`16+16*$i+0`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+2`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+4`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+6`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+8`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+10`, %f0, %f2, %f0
	camellia_fl	%f`16+16*$i+12`, %f0,      %f0
	camellia_fli	%f`16+16*$i+14`, %f2,      %f2
___
}
$code.=<<___;
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f18, %f0, %f2, %f0
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f22, %f0, %f2, %f0
	ldd		[$key + 32], %f20
	ldd		[$key + 40], %f22
	camellia_f	%f24, %f2, %f0, %f4
	camellia_f	%f26, %f0, %f4, %f2
	ldd		[$key + 48], %f24
	ldd		[$key + 56], %f26
	fxor		%f28, %f4, %f0
	fxor		%f30, %f2, %f2
	ldd		[$key + 64], %f28
	retl
	ldd		[$key + 72], %f30
.type	_cmll256_encrypt_1x,#function
.size	_cmll256_encrypt_1x,.-_cmll256_encrypt_1x

.align	32
_cmll256_encrypt_2x:
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f16, %f6, %f4, %f6
	camellia_f	%f18, %f0, %f2, %f0
	camellia_f	%f18, %f4, %f6, %f4
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f20, %f6, %f4, %f6
	camellia_f	%f22, %f0, %f2, %f0
	camellia_f	%f22, %f4, %f6, %f4
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
	camellia_f	%f24, %f2, %f0, %f2
	camellia_f	%f24, %f6, %f4, %f6
	camellia_f	%f26, %f0, %f2, %f0
	camellia_f	%f26, %f4, %f6, %f4
	ldd		[$key + 240], %f24
	ldd		[$key + 248], %f26
	camellia_fl	%f28, %f0, %f0
	camellia_fl	%f28, %f4, %f4
	camellia_fli	%f30, %f2, %f2
	camellia_fli	%f30, %f6, %f6
	ldd		[$key + 256], %f28
	ldd		[$key + 264], %f30
___
for my $i (1..2) {
    $code.=<<___;
	camellia_f	%f`16+16*$i+0`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+0`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+2`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+2`, %f4, %f6, %f4
	camellia_f	%f`16+16*$i+4`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+4`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+6`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+6`, %f4, %f6, %f4
	camellia_f	%f`16+16*$i+8`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+8`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+10`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+10`, %f4, %f6, %f4
	camellia_fl	%f`16+16*$i+12`, %f0,      %f0
	camellia_fl	%f`16+16*$i+12`, %f4,      %f4
	camellia_fli	%f`16+16*$i+14`, %f2,      %f2
	camellia_fli	%f`16+16*$i+14`, %f6,      %f6
___
}
$code.=<<___;
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f16, %f6, %f4, %f6
	camellia_f	%f18, %f0, %f2, %f0
	camellia_f	%f18, %f4, %f6, %f4
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f20, %f6, %f4, %f6
	camellia_f	%f22, %f0, %f2, %f0
	camellia_f	%f22, %f4, %f6, %f4
	ldd		[$key + 32], %f20
	ldd		[$key + 40], %f22
	camellia_f	%f24, %f2, %f0, %f8
	camellia_f	%f24, %f6, %f4, %f10
	camellia_f	%f26, %f0, %f8, %f2
	camellia_f	%f26, %f4, %f10, %f6
	ldd		[$key + 48], %f24
	ldd		[$key + 56], %f26
	fxor		%f28, %f8, %f0
	fxor		%f28, %f10, %f4
	fxor		%f30, %f2, %f2
	fxor		%f30, %f6, %f6
	ldd		[$key + 64], %f28
	retl
	ldd		[$key + 72], %f30
.type	_cmll256_encrypt_2x,#function
.size	_cmll256_encrypt_2x,.-_cmll256_encrypt_2x

.align	32
_cmll256_decrypt_1x:
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f18, %f0, %f2, %f0
	ldd		[$key - 8], %f16
	ldd		[$key - 16], %f18
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f22, %f0, %f2, %f0
	ldd		[$key - 24], %f20
	ldd		[$key - 32], %f22
	camellia_f	%f24, %f2, %f0, %f2
	camellia_f	%f26, %f0, %f2, %f0
	ldd		[$key - 40], %f24
	ldd		[$key - 48], %f26
	camellia_fl	%f28, %f0, %f0
	camellia_fli	%f30, %f2, %f2
	ldd		[$key - 56], %f28
	ldd		[$key - 64], %f30
___
for my $i (1..2) {
    $code.=<<___;
	camellia_f	%f`16+16*$i+0`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+2`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+4`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+6`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+8`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+10`, %f0, %f2, %f0
	camellia_fl	%f`16+16*$i+12`, %f0,      %f0
	camellia_fli	%f`16+16*$i+14`, %f2,      %f2
___
}
$code.=<<___;
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f18, %f0, %f2, %f0
	ldd		[$key + 184], %f16
	ldd		[$key + 176], %f18
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f22, %f0, %f2, %f0
	ldd		[$key + 168], %f20
	ldd		[$key + 160], %f22
	camellia_f	%f24, %f2, %f0, %f4
	camellia_f	%f26, %f0, %f4, %f2
	ldd		[$key + 152], %f24
	ldd		[$key + 144], %f26
	fxor		%f30, %f4, %f0
	fxor		%f28, %f2, %f2
	ldd		[$key + 136], %f28
	retl
	ldd		[$key + 128], %f30
.type	_cmll256_decrypt_1x,#function
.size	_cmll256_decrypt_1x,.-_cmll256_decrypt_1x

.align	32
_cmll256_decrypt_2x:
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f16, %f6, %f4, %f6
	camellia_f	%f18, %f0, %f2, %f0
	camellia_f	%f18, %f4, %f6, %f4
	ldd		[$key - 8], %f16
	ldd		[$key - 16], %f18
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f20, %f6, %f4, %f6
	camellia_f	%f22, %f0, %f2, %f0
	camellia_f	%f22, %f4, %f6, %f4
	ldd		[$key - 24], %f20
	ldd		[$key - 32], %f22
	camellia_f	%f24, %f2, %f0, %f2
	camellia_f	%f24, %f6, %f4, %f6
	camellia_f	%f26, %f0, %f2, %f0
	camellia_f	%f26, %f4, %f6, %f4
	ldd		[$key - 40], %f24
	ldd		[$key - 48], %f26
	camellia_fl	%f28, %f0, %f0
	camellia_fl	%f28, %f4, %f4
	camellia_fli	%f30, %f2, %f2
	camellia_fli	%f30, %f6, %f6
	ldd		[$key - 56], %f28
	ldd		[$key - 64], %f30
___
for my $i (1..2) {
    $code.=<<___;
	camellia_f	%f`16+16*$i+0`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+0`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+2`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+2`, %f4, %f6, %f4
	camellia_f	%f`16+16*$i+4`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+4`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+6`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+6`, %f4, %f6, %f4
	camellia_f	%f`16+16*$i+8`, %f2, %f0, %f2
	camellia_f	%f`16+16*$i+8`, %f6, %f4, %f6
	camellia_f	%f`16+16*$i+10`, %f0, %f2, %f0
	camellia_f	%f`16+16*$i+10`, %f4, %f6, %f4
	camellia_fl	%f`16+16*$i+12`, %f0,      %f0
	camellia_fl	%f`16+16*$i+12`, %f4,      %f4
	camellia_fli	%f`16+16*$i+14`, %f2,      %f2
	camellia_fli	%f`16+16*$i+14`, %f6,      %f6
___
}
$code.=<<___;
	camellia_f	%f16, %f2, %f0, %f2
	camellia_f	%f16, %f6, %f4, %f6
	camellia_f	%f18, %f0, %f2, %f0
	camellia_f	%f18, %f4, %f6, %f4
	ldd		[$key + 184], %f16
	ldd		[$key + 176], %f18
	camellia_f	%f20, %f2, %f0, %f2
	camellia_f	%f20, %f6, %f4, %f6
	camellia_f	%f22, %f0, %f2, %f0
	camellia_f	%f22, %f4, %f6, %f4
	ldd		[$key + 168], %f20
	ldd		[$key + 160], %f22
	camellia_f	%f24, %f2, %f0, %f8
	camellia_f	%f24, %f6, %f4, %f10
	camellia_f	%f26, %f0, %f8, %f2
	camellia_f	%f26, %f4, %f10, %f6
	ldd		[$key + 152], %f24
	ldd		[$key + 144], %f26
	fxor		%f30, %f8, %f0
	fxor		%f30, %f10, %f4
	fxor		%f28, %f2, %f2
	fxor		%f28, %f6, %f6
	ldd		[$key + 136], %f28
	retl
	ldd		[$key + 128], %f30
.type	_cmll256_decrypt_2x,#function
.size	_cmll256_decrypt_2x,.-_cmll256_decrypt_2x
___

# The mode implementations come from the shared generator functions,
# which are not defined in this file.
&alg_cbc_encrypt_implement("cmll",128);
&alg_cbc_encrypt_implement("cmll",256);

&alg_cbc_decrypt_implement("cmll",128);
&alg_cbc_decrypt_implement("cmll",256);

# CTR mode is only generated for the EVP flavour of the module.
if ($::evp) {
    &alg_ctr32_implement("cmll",128);
    &alg_ctr32_implement("cmll",256);
}
}}}

# Non-EVP flavour: emit the entry points named in the $::evp comment
# near the top of the file (Camellia_encrypt, Camellia_decrypt,
# Camellia_set_key and Camellia_cbc_encrypt).
if (!$::evp) {
# Camellia_encrypt/Camellia_decrypt are plain aliases.  Camellia_set_key
# validates its arguments before tail-branching to cmll_t4_set_key:
#   -1  if %o2 is not 8-byte aligned, or if %o0 or %o2 is zero;
#   -2  if %o1 has bits set outside 0x1c0, or is below 128.
$code.=<<___;
.global	Camellia_encrypt
Camellia_encrypt=cmll_t4_encrypt
.global	Camellia_decrypt
Camellia_decrypt=cmll_t4_decrypt
.global	Camellia_set_key
.align	32
Camellia_set_key:
	andcc		%o2, 7, %g0		! double-check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		cmll_t4_set_key
	nop
1:	retl
	nop
.type	Camellia_set_key,#function
.size	Camellia_set_key,.-Camellia_set_key
___

# Camellia_cbc_encrypt dispatches on the word at key+272 (3 selects the
# cmll128 routines, anything else the cmll256 ones) and on $enc (zero
# selects decryption).
# NOTE(review): the cmll{128,256}_t4_cbc_{en,de}crypt targets are not
# defined in this file; presumably they are produced by the
# alg_cbc_*_implement generators -- confirm against sparcv9_modes.pl.
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));

$code.=<<___;
.globl	Camellia_cbc_encrypt
.align	32
Camellia_cbc_encrypt:
	ld		[$key + 272], %g1
	nop
	brz		$enc, .Lcbc_decrypt
	cmp		%g1, 3

	be,pt		%icc, cmll128_t4_cbc_encrypt
	nop
	ba		cmll256_t4_cbc_encrypt
	nop

.Lcbc_decrypt:
	be,pt		%icc, cmll128_t4_cbc_decrypt
	nop
	ba		cmll256_t4_cbc_decrypt
	nop
.type	Camellia_cbc_encrypt,#function
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
___
}

# Hand the accumulated text over to emit_assembler, which is not defined
# in this file (presumably it comes from sparcv9_modes.pl).
&emit_assembler();

# Buffered write errors only surface at close time, so check for them.
close STDOUT or die "error closing STDOUT: $!";
